From ec7b2935b37bc73d9c8463248729b4d89e5486d6 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Thu, 27 Jul 2023 05:20:51 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 75199 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 75594 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..356d9606 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-07-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.10172v1","updated":"2023-07-19T17:57:53Z","published":"2023-07-19T17:57:53Z","title":"DialogStudio: Towards Richest and Most Diverse Unified Dataset\n Collection for Conversational AI","summary":" Despite advancements in conversational AI, language models encounter\nchallenges to handle diverse conversational tasks, and existing dialogue\ndataset collections often lack diversity and comprehensiveness. To tackle these\nissues, we introduce DialogStudio: the largest and most diverse collection of\ndialogue datasets, unified under a consistent format while preserving their\noriginal information. Our collection encompasses data from open-domain\ndialogues, task-oriented dialogues, natural language understanding,\nconversational recommendation, dialogue summarization, and knowledge-grounded\ndialogues, making it an incredibly rich and diverse resource for dialogue\nresearch and model training. To further enhance the utility of DialogStudio, we\nidentify the licenses for each dataset and design domain-aware prompts for\nselected dialogues to facilitate instruction-aware fine-tuning. Furthermore, we\ndevelop conversational AI models using the dataset collection, and our\nexperiments in both zero-shot and few-shot learning scenarios demonstrate the\nsuperiority of DialogStudio. To improve transparency and support dataset and\ntask-based research, as well as language model pre-training, all datasets,\nlicenses, codes, and models associated with DialogStudio are made publicly\naccessible at https://github.com/salesforce/DialogStudio\n","authors":["Jianguo Zhang","Kun Qian","Zhiwei Liu","Shelby Heinecke","Rui Meng","Ye Liu","Zhou Yu","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2307.10172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10169v1","updated":"2023-07-19T17:55:13Z","published":"2023-07-19T17:55:13Z","title":"Challenges and Applications of Large Language Models","summary":" Large Language Models (LLMs) went from non-existent to ubiquitous in the\nmachine learning discourse within a few years. Due to the fast pace of the\nfield, it is difficult to identify the remaining challenges and already\nfruitful application areas. In this paper, we aim to establish a systematic set\nof open problems and application successes so that ML researchers can\ncomprehend the field's current state more quickly and become productive.\n","authors":["Jean Kaddour","Joshua Harris","Maximilian Mozes","Herbie Bradley","Roberta Raileanu","Robert McHardy"],"pdf_url":"https://arxiv.org/pdf/2307.10169v1.pdf","comment":"72 pages. v01. Work in progress. 
Feedback and comments are highly\n appreciated!"},{"id":"http://arxiv.org/abs/2307.10168v1","updated":"2023-07-19T17:54:43Z","published":"2023-07-19T17:54:43Z","title":"LLMs as Workers in Human-Computational Algorithms? Replicating\n Crowdsourcing Pipelines with LLMs","summary":" LLMs have shown promise in replicating human-like behavior in crowdsourcing\ntasks that were previously thought to be exclusive to human abilities. However,\ncurrent efforts focus mainly on simple atomic tasks. We explore whether LLMs\ncan replicate more complex crowdsourcing pipelines. We find that modern LLMs\ncan simulate some of crowdworkers' abilities in these \"human computation\nalgorithms,\" but the level of success is variable and influenced by requesters'\nunderstanding of LLM capabilities, the specific skills required for sub-tasks,\nand the optimal interaction modality for performing these sub-tasks. We reflect\non human and LLMs' different sensitivities to instructions, stress the\nimportance of enabling human-facing safeguards for LLMs, and discuss the\npotential of training humans and LLMs with complementary skill sets. Crucially,\nwe show that replicating crowdsourcing pipelines offers a valuable platform to\ninvestigate (1) the relative strengths of LLMs on different tasks (by\ncross-comparing their performances on sub-tasks) and (2) LLMs' potential in\ncomplex tasks, where they can complete part of the tasks while leaving others\nto humans.\n","authors":["Tongshuang Wu","Haiyi Zhu","Maya Albayrak","Alexis Axon","Amanda Bertsch","Wenxing Deng","Ziqi Ding","Bill Guo","Sireesh Gururaja","Tzu-Sheng Kuo","Jenny T. Liang","Ryan Liu","Ihita Mandal","Jeremiah Milbauer","Xiaolin Ni","Namrata Padmanabhan","Subhashini Ramkumar","Alexis Sudjianto","Jordan Taylor","Ying-Jui Tseng","Patricia Vaidos","Zhijin Wu","Wei Wu","Chenyang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10156v1","updated":"2023-07-19T17:37:03Z","published":"2023-07-19T17:37:03Z","title":"Exploring Transformer Extrapolation","summary":" Length extrapolation has attracted considerable attention recently since it\nallows transformers to be tested on longer sequences than those used in\ntraining. Previous research has shown that this property can be attained by\nusing carefully designed Relative Positional Encodings (RPEs). While these\nmethods perform well on a variety of corpora, the conditions for length\nextrapolation have yet to be investigated. This paper attempts to determine\nwhat types of RPEs allow for length extrapolation through a thorough\nmathematical and empirical analysis. We discover that a transformer is certain\nto possess this property as long as the series that corresponds to the RPE's\nexponential converges. Two practices are derived from the conditions and\nexamined in language modeling tasks on a variety of corpora. As a bonus from\nthe conditions, we derive a new Theoretical Receptive Field (TRF) to measure\nthe receptive field of RPEs without taking any training steps. Extensive\nexperiments are conducted on the Wikitext-103, Books, Github, and WikiBook\ndatasets to demonstrate the viability of our discovered conditions. We also\ncompare TRF to Empirical Receptive Field (ERF) across different models, showing\nconsistently matched trends on the aforementioned datasets. 
The code is\navailable at https://github.com/OpenNLPLab/Rpe.\n","authors":["Zhen Qin","Yiran Zhong","Hui Deng"],"pdf_url":"https://arxiv.org/pdf/2307.10156v1.pdf","comment":"Zhen Qin and Yiran Zhong contribute equally to this paper; Yiran\n Zhong is the corresponding author. The code is available at\n https://github.com/OpenNLPLab/Rpe"},{"id":"http://arxiv.org/abs/2307.09288v2","updated":"2023-07-19T17:08:59Z","published":"2023-07-18T14:31:57Z","title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","summary":" In this work, we develop and release Llama 2, a collection of pretrained and\nfine-tuned large language models (LLMs) ranging in scale from 7 billion to 70\nbillion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for\ndialogue use cases. Our models outperform open-source chat models on most\nbenchmarks we tested, and based on our human evaluations for helpfulness and\nsafety, may be a suitable substitute for closed-source models. We provide a\ndetailed description of our approach to fine-tuning and safety improvements of\nLlama 2-Chat in order to enable the community to build on our work and\ncontribute to the responsible development of LLMs.\n","authors":["Hugo Touvron","Louis Martin","Kevin Stone","Peter Albert","Amjad Almahairi","Yasmine Babaei","Nikolay Bashlykov","Soumya Batra","Prajjwal Bhargava","Shruti Bhosale","Dan Bikel","Lukas Blecher","Cristian Canton Ferrer","Moya Chen","Guillem Cucurull","David Esiobu","Jude Fernandes","Jeremy Fu","Wenyin Fu","Brian Fuller","Cynthia Gao","Vedanuj Goswami","Naman Goyal","Anthony Hartshorn","Saghar Hosseini","Rui Hou","Hakan Inan","Marcin Kardas","Viktor Kerkez","Madian Khabsa","Isabel Kloumann","Artem Korenev","Punit Singh Koura","Marie-Anne Lachaux","Thibaut Lavril","Jenya Lee","Diana Liskovich","Yinghai Lu","Yuning Mao","Xavier Martinet","Todor Mihaylov","Pushkar Mishra","Igor Molybog","Yixin Nie","Andrew Poulton","Jeremy Reizenstein","Rashi Rungta","Kalyan Saladi","Alan Schelten","Ruan Silva","Eric Michael Smith","Ranjan Subramanian","Xiaoqing Ellen Tan","Binh Tang","Ross Taylor","Adina Williams","Jian Xiang Kuan","Puxin Xu","Zheng Yan","Iliyan Zarov","Yuchen Zhang","Angela Fan","Melanie Kambadur","Sharan Narang","Aurelien Rodriguez","Robert Stojnic","Sergey Edunov","Thomas Scialom"],"pdf_url":"https://arxiv.org/pdf/2307.09288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10098v1","updated":"2023-07-19T16:13:13Z","published":"2023-07-19T16:13:13Z","title":"Gradient Sparsification For Masked Fine-Tuning of Transformers","summary":" Fine-tuning pretrained self-supervised language models is widely adopted for\ntransfer learning to downstream tasks. Fine-tuning can be achieved by freezing\ngradients of the pretrained network and only updating gradients of a newly\nadded classification layer, or by performing gradient updates on all\nparameters. Gradual unfreezing makes a trade-off between the two by gradually\nunfreezing gradients of whole layers during training. This has been an\neffective strategy to trade-off between storage and training speed with\ngeneralization performance. However, it is not clear whether gradually\nunfreezing layers throughout training is optimal, compared to sparse variants\nof gradual unfreezing which may improve fine-tuning performance. In this paper,\nwe propose to stochastically mask gradients to regularize pretrained language\nmodels for improving overall fine-tuned performance. 
We introduce GradDrop and\nvariants thereof, a class of gradient sparsification methods that mask\ngradients during the backward pass, acting as gradient noise. GradDrop is\nsparse and stochastic unlike gradual freezing. Extensive experiments on the\nmultilingual XGLUE benchmark with XLMR-Large show that GradDrop is competitive\nagainst methods that use additional translated data for intermediate\npretraining and outperforms standard fine-tuning and gradual unfreezing. A\npost-analysis shows how GradDrop improves performance with languages it was not\ntrained on, such as under-resourced languages.\n","authors":["James O' Neill","Sourav Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10098v1.pdf","comment":"Accepted to IJCNN 2023"},{"id":"http://arxiv.org/abs/2307.10088v1","updated":"2023-07-19T15:57:24Z","published":"2023-07-19T15:57:24Z","title":"Android in the Wild: A Large-Scale Dataset for Android Device Control","summary":" There is a growing interest in device-control systems that can interpret\nhuman natural language instructions and execute them on a digital device by\ndirectly controlling its user interface. We present a dataset for\ndevice-control research, Android in the Wild (AITW), which is orders of\nmagnitude larger than current datasets. The dataset contains human\ndemonstrations of device interactions, including the screens and actions, and\ncorresponding natural language instructions. It consists of 715k episodes\nspanning 30k unique instructions, four versions of Android (v10-13),and eight\ndevice types (Pixel 2 XL to Pixel 6) with varying screen resolutions. It\ncontains multi-step tasks that require semantic understanding of language and\nvisual context. This dataset poses a new challenge: actions available through\nthe user interface must be inferred from their visual appearance. And, instead\nof simple UI element-based actions, the action space consists of precise\ngestures (e.g., horizontal scrolls to operate carousel widgets). We organize\nour dataset to encourage robustness analysis of device-control systems, i.e.,\nhow well a system performs in the presence of new task descriptions, new\napplications, or new platform versions. We develop two agents and report\nperformance across the dataset. The dataset is available at\nhttps://github.com/google-research/google-research/tree/master/android_in_the_wild.\n","authors":["Christopher Rawles","Alice Li","Daniel Rodriguez","Oriana Riva","Timothy Lillicrap"],"pdf_url":"https://arxiv.org/pdf/2307.10088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11596v3","updated":"2023-07-19T15:25:37Z","published":"2023-01-27T08:45:53Z","title":"ThoughtSource: A central hub for large language model reasoning data","summary":" Large language models (LLMs) such as GPT-4 have recently demonstrated\nimpressive results across a wide range of tasks. LLMs are still limited,\nhowever, in that they frequently fail at complex reasoning, their reasoning\nprocesses are opaque, they are prone to 'hallucinate' facts, and there are\nconcerns about their underlying biases. Letting models verbalize reasoning\nsteps as natural language, a technique known as chain-of-thought prompting, has\nrecently been proposed as a way to address some of these issues. Here we\npresent ThoughtSource, a meta-dataset and software library for chain-of-thought\n(CoT) reasoning. 
The goal of ThoughtSource is to improve future artificial\nintelligence systems by facilitating qualitative understanding of CoTs,\nenabling empirical evaluations, and providing training data. This first release\nof ThoughtSource integrates six scientific/medical, three general-domain and\nfive math word question answering datasets.\n","authors":["Simon Ott","Konstantin Hebenstreit","Valentin Liévin","Christoffer Egeberg Hother","Milad Moradi","Maximilian Mayrhauser","Robert Praas","Ole Winther","Matthias Samwald"],"pdf_url":"https://arxiv.org/pdf/2301.11596v3.pdf","comment":"Revision: added datasets, minor restructuring"},{"id":"http://arxiv.org/abs/2307.10025v1","updated":"2023-07-19T15:09:50Z","published":"2023-07-19T15:09:50Z","title":"An Empirical Study on Fertility Proposals Using Multi-Grined Topic\n Analysis Methods","summary":" Fertility issues are closely related to population security, in 60 years\nChina's population for the first time in a negative growth trend, the change of\nfertility policy is of great concern to the community. 2023 ``two sessions\"\nproposal ``suggests that the country in the form of legislation, the birth of\nthe registration of the cancellation of the marriage restriction\" This topic\nwas once a hot topic on the Internet, and ``unbundling\" the relationship\nbetween birth registration and marriage has become the focus of social debate.\nIn this paper, we adopt co-occurrence semantic analysis, topic analysis and\nsentiment analysis to conduct multi-granularity semantic analysis of microblog\ncomments. It is found that the discussion on the proposal of ``removing\nmarriage restrictions from birth registration\" involves the individual, society\nand the state at three dimensions, and is detailed into social issues such as\npersonal behaviour, social ethics and law, and national policy, with people's\nsentiment inclined to be negative in most of the topics. Based on this, eight\nproposals were made to provide a reference for governmental decision making and\nto form a reference method for researching public opinion on political issues.\n","authors":["Yulin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.10025v1.pdf","comment":"7 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.09456v2","updated":"2023-07-19T14:27:57Z","published":"2023-07-18T17:35:45Z","title":"A comparative analysis of SRGAN models","summary":" In this study, we evaluate the performance of multiple state-of-the-art SRGAN\n(Super Resolution Generative Adversarial Network) models, ESRGAN, Real-ESRGAN\nand EDSR, on a benchmark dataset of real-world images which undergo degradation\nusing a pipeline. Our results show that some models seem to significantly\nincrease the resolution of the input images while preserving their visual\nquality, this is assessed using Tesseract OCR engine. We observe that EDSR-BASE\nmodel from huggingface outperforms the remaining candidate models in terms of\nboth quantitative metrics and subjective visual quality assessments with least\ncompute overhead. Specifically, EDSR generates images with higher peak\nsignal-to-noise ratio (PSNR) and structural similarity index (SSIM) values and\nare seen to return high quality OCR results with Tesseract OCR engine. 
These\nfindings suggest that EDSR is a robust and effective approach for single-image\nsuper-resolution and may be particularly well-suited for applications where\nhigh-quality visual fidelity is critical and optimized compute.\n","authors":["Fatemeh Rezapoor Nikroo","Ajinkya Deshmukh","Anantha Sharma","Adrian Tam","Kaarthik Kumar","Cleo Norris","Aditya Dangi"],"pdf_url":"https://arxiv.org/pdf/2307.09456v2.pdf","comment":"9 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2307.09998v1","updated":"2023-07-19T14:13:02Z","published":"2023-07-19T14:13:02Z","title":"Generating Mathematical Derivations with Large Language Models","summary":" The derivation of mathematical results in specialised fields using Large\nLanguage Models (LLMs) is an emerging research direction that can help identify\nmodels' limitations, and potentially support mathematical discovery. In this\npaper, we leverage a symbolic engine to generate derivations of equations at\nscale, and investigate the capabilities of LLMs when deriving goal equations\nfrom premises. Specifically, we employ in-context learning for GPT and\nfine-tune a range of T5 models to compare the robustness and generalisation of\npre-training strategies to specialised models. Empirical results show that\nfine-tuned FLAN-T5-large (MathT5) outperforms GPT models on all static and\nout-of-distribution test sets in terms of absolute performance. However, an\nin-depth analysis reveals that the fine-tuned models are more sensitive to\nperturbations involving unseen symbols and (to a lesser extent) changes to\nequation structure. In addition, we analyse 1.7K equations and over 200\nderivations to highlight common reasoning errors such as the inclusion of\nincorrect, irrelevant, and redundant equations, along with the tendency to skip\nderivation steps. Finally, we explore the suitability of existing metrics for\nevaluating mathematical derivations finding evidence that, while they capture\ngeneral properties such as sensitivity to perturbations, they fail to highlight\nfine-grained reasoning errors and essential differences between models.\nOverall, this work demonstrates that training models on synthetic data can\nimprove their mathematical capabilities beyond larger architectures.\n","authors":["Jordan Meadows","Marco Valentino","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2307.09998v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2303.15056v2","updated":"2023-07-19T14:10:55Z","published":"2023-03-27T09:59:48Z","title":"ChatGPT Outperforms Crowd-Workers for Text-Annotation Tasks","summary":" Many NLP applications require manual data annotations for a variety of tasks,\nnotably to train classifiers or evaluate the performance of unsupervised\nmodels. Depending on the size and degree of complexity, the tasks may be\nconducted by crowd-workers on platforms such as MTurk as well as trained\nannotators, such as research assistants. Using a sample of 2,382 tweets, we\ndemonstrate that ChatGPT outperforms crowd-workers for several annotation\ntasks, including relevance, stance, topics, and frames detection. Specifically,\nthe zero-shot accuracy of ChatGPT exceeds that of crowd-workers for four out of\nfive tasks, while ChatGPT's intercoder agreement exceeds that of both\ncrowd-workers and trained annotators for all tasks. Moreover, the\nper-annotation cost of ChatGPT is less than $0.003 -- about twenty times\ncheaper than MTurk. 
These results show the potential of large language models\nto drastically increase the efficiency of text classification.\n","authors":["Fabrizio Gilardi","Meysam Alizadeh","Maël Kubli"],"pdf_url":"https://arxiv.org/pdf/2303.15056v2.pdf","comment":"Gilardi, Fabrizio, Meysam Alizadeh, and Ma\\\"el Kubli. 2023. \"ChatGPT\n Outperforms Crowd Workers for Text-Annotation Tasks\". Proceedings of the\n National Academy of Sciences 120(30): e2305016120"},{"id":"http://arxiv.org/abs/2210.14037v2","updated":"2023-07-19T13:43:07Z","published":"2022-10-25T14:13:53Z","title":"Revisiting Softmax for Uncertainty Approximation in Text Classification","summary":" Uncertainty approximation in text classification is an important area with\napplications in domain adaptation and interpretability. One of the most widely\nused uncertainty approximation methods is Monte Carlo (MC) Dropout, which is\ncomputationally expensive as it requires multiple forward passes through the\nmodel. A cheaper alternative is to simply use the softmax based on a single\nforward pass without dropout to estimate model uncertainty. However, prior work\nhas indicated that these predictions tend to be overconfident. In this paper,\nwe perform a thorough empirical analysis of these methods on five datasets with\ntwo base neural architectures in order to identify the trade-offs between the\ntwo. We compare both softmax and an efficient version of MC Dropout on their\nuncertainty approximations and downstream text classification performance,\nwhile weighing their runtime (cost) against performance (benefit). We find\nthat, while MC dropout produces the best uncertainty approximations, using a\nsimple softmax leads to competitive and in some cases better uncertainty\nestimation for text classification at a much lower computational cost,\nsuggesting that softmax can in fact be a sufficient uncertainty estimate when\ncomputational resources are a concern.\n","authors":["Andreas Nugaard Holm","Dustin Wright","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2210.14037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09959v1","updated":"2023-07-19T13:01:03Z","published":"2023-07-19T13:01:03Z","title":"GUIDO: A Hybrid Approach to Guideline Discovery & Ordering from Natural\n Language Texts","summary":" Extracting workflow nets from textual descriptions can be used to simplify\nguidelines or formalize textual descriptions of formal processes like business\nprocesses and algorithms. The task of manually extracting processes, however,\nrequires domain expertise and effort. While automatic process model extraction\nis desirable, annotating texts with formalized process models is expensive.\nTherefore, there are only a few machine-learning-based extraction approaches.\nRule-based approaches, in turn, require domain specificity to work well and can\nrarely distinguish relevant and irrelevant information in textual descriptions.\nIn this paper, we present GUIDO, a hybrid approach to the process model\nextraction task that first, classifies sentences regarding their relevance to\nthe process model, using a BERT-based sentence classifier, and second, extracts\na process model from the sentences classified as relevant, using dependency\nparsing. The presented approach achieves significantly better results than a\npure rule-based approach. GUIDO achieves an average behavioral similarity score\nof $0.93$. 
Still, in comparison to purely machine-learning-based approaches,\nthe annotation costs stay low.\n","authors":["Nils Freyer","Dustin Thewes","Matthias Meinecke"],"pdf_url":"https://arxiv.org/pdf/2307.09959v1.pdf","comment":"Preprint of the short paper presented at the 12th International\n Conference on Data Science, Technology and Applications"},{"id":"http://arxiv.org/abs/2307.02486v2","updated":"2023-07-19T12:25:35Z","published":"2023-07-05T17:59:38Z","title":"LongNet: Scaling Transformers to 1,000,000,000 Tokens","summary":" Scaling sequence length has become a critical demand in the era of large\nlanguage models. However, existing methods struggle with either computational\ncomplexity or model expressivity, rendering the maximum sequence length\nrestricted. To address this issue, we introduce LongNet, a Transformer variant\nthat can scale sequence length to more than 1 billion tokens, without\nsacrificing the performance on shorter sequences. Specifically, we propose\ndilated attention, which expands the attentive field exponentially as the\ndistance grows. LongNet has significant advantages: 1) it has a linear\ncomputation complexity and a logarithm dependency between any two tokens in a\nsequence; 2) it can be served as a distributed trainer for extremely long\nsequences; 3) its dilated attention is a drop-in replacement for standard\nattention, which can be seamlessly integrated with the existing\nTransformer-based optimization. Experiments results demonstrate that LongNet\nyields strong performance on both long-sequence modeling and general language\ntasks. Our work opens up new possibilities for modeling very long sequences,\ne.g., treating a whole corpus or even the entire Internet as a sequence.\n","authors":["Jiayu Ding","Shuming Ma","Li Dong","Xingxing Zhang","Shaohan Huang","Wenhui Wang","Nanning Zheng","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.02486v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2307.09923v1","updated":"2023-07-19T11:54:46Z","published":"2023-07-19T11:54:46Z","title":"Large Language Models can accomplish Business Process Management Tasks","summary":" Business Process Management (BPM) aims to improve organizational activities\nand their outcomes by managing the underlying processes. To achieve this, it is\noften necessary to consider information from various sources, including\nunstructured textual documents. Therefore, researchers have developed several\nBPM-specific solutions that extract information from textual documents using\nNatural Language Processing techniques. These solutions are specific to their\nrespective tasks and cannot accomplish multiple process-related problems as a\ngeneral-purpose instrument. However, in light of the recent emergence of Large\nLanguage Models (LLMs) with remarkable reasoning capabilities, such a\ngeneral-purpose instrument with multiple applications now appears attainable.\nIn this paper, we illustrate how LLMs can accomplish text-related BPM tasks by\napplying a specific LLM to three exemplary tasks: mining imperative process\nmodels from textual descriptions, mining declarative process models from\ntextual descriptions, and assessing the suitability of process tasks from\ntextual descriptions for robotic process automation. 
We show that, without\nextensive configuration or prompt engineering, LLMs perform comparably to or\nbetter than existing solutions and discuss implications for future BPM research\nas well as practical usage.\n","authors":["Michael Grohs","Luka Abb","Nourhan Elsayed","Jana-Rebecca Rehse"],"pdf_url":"https://arxiv.org/pdf/2307.09923v1.pdf","comment":"Accepted at NLP4BPM workshop at BPM 2023"},{"id":"http://arxiv.org/abs/2307.09885v1","updated":"2023-07-19T10:28:59Z","published":"2023-07-19T10:28:59Z","title":"Test-takers have a say: understanding the implications of the use of AI\n in language tests","summary":" Language tests measure a person's ability to use a language in terms of\nlistening, speaking, reading, or writing. Such tests play an integral role in\nacademic, professional, and immigration domains, with entities such as\neducational institutions, professional accreditation bodies, and governments\nusing them to assess candidate language proficiency. Recent advances in\nArtificial Intelligence (AI) and the discipline of Natural Language Processing\nhave prompted language test providers to explore AI's potential applicability\nwithin language testing, leading to transformative activity patterns\nsurrounding language instruction and learning. However, with concerns over AI's\ntrustworthiness, it is imperative to understand the implications of integrating\nAI into language testing. This knowledge will enable stakeholders to make\nwell-informed decisions, thus safeguarding community well-being and testing\nintegrity. To understand the concerns and effects of AI usage in language\ntests, we conducted interviews and surveys with English test-takers. To the\nbest of our knowledge, this is the first empirical study aimed at identifying\nthe implications of AI adoption in language tests from a test-taker\nperspective. Our study reveals test-taker perceptions and behavioral patterns.\nSpecifically, we identify that AI integration may enhance perceptions of\nfairness, consistency, and availability. Conversely, it might incite mistrust\nregarding reliability and interactivity aspects, subsequently influencing the\nbehaviors and well-being of test-takers. These insights provide a better\nunderstanding of potential societal implications and assist stakeholders in\nmaking informed decisions concerning AI usage in language testing.\n","authors":["Dawen Zhang","Thong Hoang","Shidong Pan","Yongquan Hu","Zhenchang Xing","Mark Staples","Xiwei Xu","Qinghua Lu","Aaron Quigley"],"pdf_url":"https://arxiv.org/pdf/2307.09885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09416v2","updated":"2023-07-19T08:27:50Z","published":"2023-07-18T16:33:30Z","title":"Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation\n Evaluation","summary":" Research in Image Generation has recently made significant progress,\nparticularly boosted by the introduction of Vision-Language models which are\nable to produce high-quality visual content based on textual inputs. Despite\nongoing advancements in terms of generation quality and realism, no methodical\nframeworks have been defined yet to quantitatively measure the quality of the\ngenerated content and the adherence with the prompted requests: so far, only\nhuman-based evaluations have been adopted for quality satisfaction and for\ncomparing different generative methods. We introduce a novel automated method\nfor Visual Concept Evaluation (ViCE), i.e. 
to assess consistency between a\ngenerated/edited image and the corresponding prompt/instructions, with a\nprocess inspired by the human cognitive behaviour. ViCE combines the strengths\nof Large Language Models (LLMs) and Visual Question Answering (VQA) into a\nunified pipeline, aiming to replicate the human cognitive process in quality\nassessment. This method outlines visual concepts, formulates image-specific\nverification questions, utilizes the Q&A system to investigate the image, and\nscores the combined outcome. Although this brave new hypothesis of mimicking\nhumans in the image evaluation process is in its preliminary assessment stage,\nresults are promising and open the door to a new form of automatic evaluation\nwhich could have significant impact as the image generation or the image target\nediting tasks become more and more sophisticated.\n","authors":["Federico Betti","Jacopo Staiano","Lorenzo Baraldi","Lorenzo Baraldi","Rita Cucchiara","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2307.09416v2.pdf","comment":"Accepted as oral at ACM MultiMedia 2023 (Brave New Ideas track)"},{"id":"http://arxiv.org/abs/2307.09813v1","updated":"2023-07-19T08:02:20Z","published":"2023-07-19T08:02:20Z","title":"DAPrompt: Deterministic Assumption Prompt Learning for Event Causality\n Identification","summary":" Event Causality Identification (ECI) aims at determining whether there is a\ncausal relation between two event mentions. Conventional prompt learning\ndesigns a prompt template to first predict an answer word and then maps it to\nthe final decision. Unlike conventional prompts, we argue that predicting an\nanswer word may not be a necessary prerequisite for the ECI task. Instead, we\ncan first make a deterministic assumption on the existence of causal relation\nbetween two events and then evaluate its rationality to either accept or reject\nthe assumption. The design motivation is to try the most utilization of the\nencyclopedia-like knowledge embedded in a pre-trained language model. In light\nof such considerations, we propose a deterministic assumption prompt learning\nmodel, called DAPrompt, for the ECI task. In particular, we design a simple\ndeterministic assumption template concatenating with the input event pair,\nwhich includes two masks as predicted events' tokens. We use the probabilities\nof predicted events to evaluate the assumption rationality for the final event\ncausality decision. Experiments on the EventStoryLine corpus and\nCausal-TimeBank corpus validate our design objective in terms of significant\nperformance improvements over the state-of-the-art algorithms.\n","authors":["Wei Xiang","Chuanhong Zhan","Bang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09793v1","updated":"2023-07-19T07:17:43Z","published":"2023-07-19T07:17:43Z","title":"On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large\n Language Models","summary":" Since late 2022, Large Language Models (LLMs) have become very prominent with\nLLMs like ChatGPT and Bard receiving millions of users. Hundreds of new LLMs\nare announced each week, many of which are deposited to Hugging Face, a\nrepository of machine learning models and datasets. To date, nearly 16,000 Text\nGeneration models have been uploaded to the site. Given the huge influx of\nLLMs, it is of interest to know which LLM backbones, settings, training\nmethods, and families are popular or trending. However, there is no\ncomprehensive index of LLMs available. 
We take advantage of the relatively\nsystematic nomenclature of Hugging Face LLMs to perform hierarchical clustering\nand identify communities amongst LLMs using n-grams and term frequency-inverse\ndocument frequency. Our methods successfully identify families of LLMs and\naccurately cluster LLMs into meaningful subgroups. We present a public web\napplication to navigate and explore Constellation, our atlas of 15,821 LLMs.\nConstellation rapidly generates a variety of visualizations, namely\ndendrograms, graphs, word clouds, and scatter plots. Constellation is available\nat the following link: https://constellation.sites.stanford.edu/.\n","authors":["Sarah Gao","Andrew Kean Gao"],"pdf_url":"https://arxiv.org/pdf/2307.09793v1.pdf","comment":"14 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.09782v1","updated":"2023-07-19T06:58:03Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01692v4","updated":"2023-07-19T06:48:35Z","published":"2022-12-03T21:14:32Z","title":"Can In-context Learners Learn a Reasoning Concept from Demonstrations?","summary":" Language models exhibit an emergent ability to learn a new task from a small\nnumber of input-output demonstrations. 
However, recent work shows that\nin-context learners largely rely on their pre-trained knowledge, such as the\nsentiment of the labels, instead of learning new associations from the input.\nWe argue that the commonly-used few-shot evaluation using a random selection of\nin-context demonstrations can not disentangle models' reliance on such biases,\nas most of the randomly-selected demonstrations do not present relations\ninformative for prediction beyond exposing the task's input-output\ndistribution.\n Therefore, to evaluate models' in-context learning ability independent of\nmodels' memory, we introduce a Concept-sharing few-shot learning method\nchoosing the demonstrations that share an underlying concept with the predicted\nsample. We extract a set of such concepts from available human explanations and\nmeasure how much models can benefit from presenting these concepts in few-shot\ndemonstrations.\n We find that most of the recent in-context learners can not consistently\nbenefit from the demonstrated concepts, irrespective of the model size.\nHowever, we note that T0 models are more sensitive to exhibited concepts,\nbenefiting from concept-sharing demonstrations in 7 out of 8 evaluation\nscenarios.\n","authors":["Michal Štefánik","Marek Kadlčík"],"pdf_url":"https://arxiv.org/pdf/2212.01692v4.pdf","comment":"Awarded Best Paper at ACL 2023 Natural Language Reasoning and\n Structured Explanations (NLRSE) workshop"},{"id":"http://arxiv.org/abs/2307.08621v2","updated":"2023-07-19T05:56:42Z","published":"2023-07-17T16:40:01Z","title":"Retentive Network: A Successor to Transformer for Large Language Models","summary":" In this work, we propose Retentive Network (RetNet) as a foundation\narchitecture for large language models, simultaneously achieving training\nparallelism, low-cost inference, and good performance. We theoretically derive\nthe connection between recurrence and attention. Then we propose the retention\nmechanism for sequence modeling, which supports three computation paradigms,\ni.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel\nrepresentation allows for training parallelism. The recurrent representation\nenables low-cost $O(1)$ inference, which improves decoding throughput, latency,\nand GPU memory without sacrificing performance. The chunkwise recurrent\nrepresentation facilitates efficient long-sequence modeling with linear\ncomplexity, where each chunk is encoded parallelly while recurrently\nsummarizing the chunks. Experimental results on language modeling show that\nRetNet achieves favorable scaling results, parallel training, low-cost\ndeployment, and efficient inference. The intriguing properties make RetNet a\nstrong successor to Transformer for large language models. Code will be\navailable at https://aka.ms/retnet.\n","authors":["Yutao Sun","Li Dong","Shaohan Huang","Shuming Ma","Yuqing Xia","Jilong Xue","Jianyong Wang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.08621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10551v3","updated":"2023-07-19T05:52:32Z","published":"2022-12-20T18:54:08Z","title":"Lego-MT: Learning Detachable Models for Massively Multilingual Machine\n Translation","summary":" Multilingual neural machine translation (MNMT) aims to build a unified model\nfor many language directions. Existing monolithic models for MNMT encounter two\nchallenges: parameter interference among languages and inefficient inference\nfor large models. 
In this paper, we revisit the classic multi-way structures\nand develop a detachable model by assigning each language (or group of\nlanguages) to an individual branch that supports plug-and-play training and\ninference. To address the needs of learning representations for all languages\nin a unified space, we propose a novel efficient training recipe, upon which we\nbuild an effective detachable model, Lego-MT. For a fair comparison, we collect\ndata from OPUS and build a translation benchmark covering 433 languages and\n1.3B parallel data. Experiments show that Lego-MT with 1.2B parameters brings\nan average gain of 3.2 spBLEU. It even outperforms M2M-100 with 12B parameters.\nThe proposed training recipe brings a 28.2$\\times$ speedup over the\nconventional multi-way training method.\\footnote{\n\\url{https://github.com/CONE-MT/Lego-MT}.}\n","authors":["Fei Yuan","Yinquan Lu","WenHao Zhu","Lingpeng Kong","Lei Li","Yu Qiao","Jingjing Xu"],"pdf_url":"https://arxiv.org/pdf/2212.10551v3.pdf","comment":"ACL 2023 Findings"},{"id":"http://arxiv.org/abs/2303.12135v4","updated":"2023-07-19T05:30:31Z","published":"2023-03-21T18:48:11Z","title":"Understand Legal Documents with Contextualized Large Language Models","summary":" The growth of pending legal cases in populous countries, such as India, has\nbecome a major issue. Developing effective techniques to process and understand\nlegal documents is extremely useful in resolving this problem. In this paper,\nwe present our systems for SemEval-2023 Task 6: understanding legal texts (Modi\net al., 2023). Specifically, we first develop the Legal-BERT-HSLN model that\nconsiders the comprehensive context information in both intra- and\ninter-sentence levels to predict rhetorical roles (subtask A) and then train a\nLegal-LUKE model, which is legal-contextualized and entity-aware, to recognize\nlegal entities (subtask B). Our evaluations demonstrate that our designed\nmodels are more accurate than baselines, e.g., with an up to 15.0% better F1\nscore in subtask B. We achieved notable performance in the task leaderboard,\ne.g., 0.834 micro F1 score, and ranked No.5 out of 27 teams in subtask A.\n","authors":["Xin Jin","Yuchen Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12135v4.pdf","comment":"SemEval 2023"},{"id":"http://arxiv.org/abs/2306.07848v5","updated":"2023-07-19T04:56:33Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Speech Emotion Recognition","summary":" Contrastive learning based cross-modality pretraining methods have recently\nexhibited impressive success in diverse fields. In this paper, we propose\nGEmo-CLAP, a kind of gender-attribute-enhanced contrastive language-audio\npretraining (CLAP) method for speech emotion recognition. Specifically, a novel\nemotion CLAP model (Emo-CLAP) is first built, utilizing various self-supervised\npre-trained models. Second, considering the importance of gender attribute in\nspeech emotion modeling, the soft label based GEmo-CLAP (SL-GEmo-CLAP) and\nmulti-task learning based GEmo-CLAP (ML-GEmo-CLAP) are further proposed to\nintegrate the emotion and gender information of speech signals, forming more\nreasonable objectives. Extensive experiments on IEMOCAP show that our proposed\ntwo GEmo-CLAP models consistently outperform the baseline Emo-CLAP with\ndifferent pre-trained models, while also achieving the best recognition\nperformance compared with recent state-of-the-art methods. 
Noticeably, the\nproposed WavLM-based ML-GEmo-CLAP obtains the best UAR of 80.16\\% and WAR of\n82.06\\%.\n","authors":["Yu Pan","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2306.07848v5.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2307.09744v1","updated":"2023-07-19T04:25:21Z","published":"2023-07-19T04:25:21Z","title":"Enhancing conversational quality in language learning chatbots: An\n evaluation of GPT4 for ASR error correction","summary":" The integration of natural language processing (NLP) technologies into\neducational applications has shown promising results, particularly in the\nlanguage learning domain. Recently, many spoken open-domain chatbots have been\nused as speaking partners, helping language learners improve their language\nskills. However, one of the significant challenges is the high word-error-rate\n(WER) when recognizing non-native/non-fluent speech, which interrupts\nconversation flow and leads to disappointment for learners. This paper explores\nthe use of GPT4 for ASR error correction in conversational settings. In\naddition to WER, we propose to use semantic textual similarity (STS) and next\nresponse sensibility (NRS) metrics to evaluate the impact of error correction\nmodels on the quality of the conversation. We find that transcriptions\ncorrected by GPT4 lead to higher conversation quality, despite an increase in\nWER. GPT4 also outperforms standard error correction methods without the need\nfor in-domain training data.\n","authors":["Long Mai","Julie Carson-Berndsen"],"pdf_url":"https://arxiv.org/pdf/2307.09744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09455v2","updated":"2023-07-19T04:13:11Z","published":"2023-07-18T17:29:23Z","title":"Pseudo Outlier Exposure for Out-of-Distribution Detection using\n Pretrained Transformers","summary":" For real-world language applications, detecting an out-of-distribution (OOD)\nsample is helpful to alert users or reject such unreliable samples. However,\nmodern over-parameterized language models often produce overconfident\npredictions for both in-distribution (ID) and OOD samples. In particular,\nlanguage models suffer from OOD samples with a similar semantic representation\nto ID samples since these OOD samples lie near the ID manifold. A rejection\nnetwork can be trained with ID and diverse outlier samples to detect test OOD\nsamples, but explicitly collecting auxiliary OOD datasets brings an additional\nburden for data collection. In this paper, we propose a simple but effective\nmethod called Pseudo Outlier Exposure (POE) that constructs a surrogate OOD\ndataset by sequentially masking tokens related to ID classes. The surrogate OOD\nsample introduced by POE shows a similar representation to ID data, which is\nmost effective in training a rejection network. Our method does not require any\nexternal OOD data and can be easily implemented within off-the-shelf\nTransformers. 
A comprehensive comparison with state-of-the-art algorithms\ndemonstrates POE's competitiveness on several text classification benchmarks.\n","authors":["Jaeyoung Kim","Kyuheon Jung","Dongbin Na","Sion Jang","Eunbin Park","Sungchul Choi"],"pdf_url":"https://arxiv.org/pdf/2307.09455v2.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.09706v1","updated":"2023-07-19T01:37:31Z","published":"2023-07-19T01:37:31Z","title":"RaTE: a Reproducible automatic Taxonomy Evaluation by Filling the Gap","summary":" Taxonomies are an essential knowledge representation, yet most studies on\nautomatic taxonomy construction (ATC) resort to manual evaluation to score\nproposed algorithms. We argue that automatic taxonomy evaluation (ATE) is just\nas important as taxonomy construction. We propose RaTE, an automatic label-free\ntaxonomy scoring procedure, which relies on a large pre-trained language model.\nWe apply our evaluation procedure to three state-of-the-art ATC algorithms with\nwhich we built seven taxonomies from the Yelp domain, and show that 1) RaTE\ncorrelates well with human judgments and 2) artificially degrading a taxonomy\nleads to decreasing RaTE score.\n","authors":["Tianjian Gao","Phillipe Langlais"],"pdf_url":"https://arxiv.org/pdf/2307.09706v1.pdf","comment":"15th International Conference on Computational Semantics (IWCS),\n Association for Computational Linguistics (ACL)"},{"id":"http://arxiv.org/abs/2307.03135v2","updated":"2023-07-19T01:28:30Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. We propose two principles\nfrom vision and language modality perspectives to enhance student's OOD\ngeneralization: (1) by better imitating teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and finegrained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate their techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. 
Code released at\nhttps://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v2.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2307.09705v1","updated":"2023-07-19T01:22:40Z","published":"2023-07-19T01:22:40Z","title":"CValues: Measuring the Values of Chinese Large Language Models from\n Safety to Responsibility","summary":" With the rapid evolution of large language models (LLMs), there is a growing\nconcern that they may pose risks or have negative social impacts. Therefore,\nevaluation of human values alignment is becoming increasingly important.\nPrevious work mainly focuses on assessing the performance of LLMs on certain\nknowledge and reasoning abilities, while neglecting the alignment to human\nvalues, especially in a Chinese context. In this paper, we present CValues, the\nfirst Chinese human values evaluation benchmark to measure the alignment\nability of LLMs in terms of both safety and responsibility criteria. As a\nresult, we have manually collected adversarial safety prompts across 10\nscenarios and induced responsibility prompts from 8 domains by professional\nexperts. To provide a comprehensive values evaluation of Chinese LLMs, we not\nonly conduct human evaluation for reliable comparison, but also construct\nmulti-choice prompts for automatic evaluation. Our findings suggest that while\nmost Chinese LLMs perform well in terms of safety, there is considerable room\nfor improvement in terms of responsibility. Moreover, both the automatic and\nhuman evaluation are important for assessing the human values alignment in\ndifferent aspects. The benchmark and code is available on ModelScope and\nGithub.\n","authors":["Guohai Xu","Jiayi Liu","Ming Yan","Haotian Xu","Jinghui Si","Zhuoran Zhou","Peng Yi","Xing Gao","Jitao Sang","Rong Zhang","Ji Zhang","Chao Peng","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.09705v1.pdf","comment":"Working in Process"},{"id":"http://arxiv.org/abs/2307.09702v1","updated":"2023-07-19T01:14:49Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for LLMs","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09701v1","updated":"2023-07-19T01:05:33Z","published":"2023-07-19T01:05:33Z","title":"Efficiency Pentathlon: A Standardized Arena for Efficiency Evaluation","summary":" Rising computational demands of modern natural language processing (NLP)\nsystems have increased the barrier to entry for cutting-edge research while\nposing serious environmental concerns. Yet, progress on model efficiency has\nbeen impeded by practical challenges in model evaluation and comparison. For\nexample, hardware is challenging to control due to disparate levels of\naccessibility across different institutions. 
Moreover, improvements in metrics\nsuch as FLOPs often fail to translate to progress in real-world applications.\nIn response, we introduce Pentathlon, a benchmark for holistic and realistic\nevaluation of model efficiency. Pentathlon focuses on inference, which accounts\nfor a majority of the compute in a model's lifecycle. It offers a\nstrictly-controlled hardware platform, and is designed to mirror real-world\napplications scenarios. It incorporates a suite of metrics that target\ndifferent aspects of efficiency, including latency, throughput, memory\noverhead, and energy consumption. Pentathlon also comes with a software library\nthat can be seamlessly integrated into any codebase and enable evaluation. As a\nstandardized and centralized evaluation platform, Pentathlon can drastically\nreduce the workload to make fair and reproducible efficiency comparisons. While\ninitially focused on natural language processing (NLP) models, Pentathlon is\ndesigned to allow flexible extension to other fields. We envision Pentathlon\nwill stimulate algorithmic innovations in building efficient models, and foster\nan increased awareness of the social and environmental implications in the\ndevelopment of future-generation NLP models.\n","authors":["Hao Peng","Qingqing Cao","Jesse Dodge","Matthew E. Peters","Jared Fernandez","Tom Sherborne","Kyle Lo","Sam Skjonsberg","Emma Strubell","Darrell Plessas","Iz Beltagy","Evan Pete Walsh","Noah A. Smith","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2307.09701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08272v2","updated":"2023-07-19T23:52:23Z","published":"2023-07-17T06:36:53Z","title":"ChatGPT is Good but Bing Chat is Better for Vietnamese Students","summary":" This study examines the efficacy of two SOTA large language models (LLMs),\nnamely ChatGPT and Microsoft Bing Chat (BingChat), in catering to the needs of\nVietnamese students. Although ChatGPT exhibits proficiency in multiple\ndisciplines, Bing Chat emerges as the more advantageous option. We conduct a\ncomparative analysis of their academic achievements in various disciplines,\nencompassing mathematics, literature, English language, physics, chemistry,\nbiology, history, geography, and civic education. The results of our study\nsuggest that BingChat demonstrates superior performance compared to ChatGPT\nacross a wide range of subjects, with the exception of literature, where\nChatGPT exhibits better performance. Additionally, BingChat utilizes the more\nadvanced GPT-4 technology in contrast to ChatGPT, which is built upon GPT-3.5.\nThis allows BingChat to improve to comprehension, reasoning and generation of\ncreative and informative text. Moreover, the fact that BingChat is accessible\nin Vietnam and its integration of hyperlinks and citations within responses\nserve to reinforce its superiority. In our analysis, it is evident that while\nChatGPT exhibits praiseworthy qualities, BingChat presents a more apdated\nsolutions for Vietnamese students.\n","authors":["Xuan-Quy Dao","Ngoc-Bich Le"],"pdf_url":"https://arxiv.org/pdf/2307.08272v2.pdf","comment":"13 pages; 6 figures"},{"id":"http://arxiv.org/abs/2307.10490v1","updated":"2023-07-19T23:03:20Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. 
An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10488v1","updated":"2023-07-19T22:48:02Z","published":"2023-07-19T22:48:02Z","title":"SPRINT: A Unified Toolkit for Evaluating and Demystifying Zero-shot\n Neural Sparse Retrieval","summary":" Traditionally, sparse retrieval systems relied on lexical representations to\nretrieve documents, such as BM25, dominated information retrieval tasks. With\nthe onset of pre-trained transformer models such as BERT, neural sparse\nretrieval has led to a new paradigm within retrieval. Despite the success,\nthere has been limited software supporting different sparse retrievers running\nin a unified, common environment. This hinders practitioners from fairly\ncomparing different sparse models and obtaining realistic evaluation results.\nAnother missing piece is, that a majority of prior work evaluates sparse\nretrieval models on in-domain retrieval, i.e. on a single dataset: MS MARCO.\nHowever, a key requirement in practical retrieval systems requires models that\ncan generalize well to unseen out-of-domain, i.e. zero-shot retrieval tasks. In\nthis work, we provide SPRINT, a unified Python toolkit based on Pyserini and\nLucene, supporting a common interface for evaluating neural sparse retrieval.\nThe toolkit currently includes five built-in models: uniCOIL, DeepImpact,\nSPARTA, TILDEv2 and SPLADEv2. Users can also easily add customized models by\ndefining their term weighting method. Using our toolkit, we establish strong\nand reproducible zero-shot sparse retrieval baselines across the\nwell-acknowledged benchmark, BEIR. Our results demonstrate that SPLADEv2\nachieves the best average score of 0.470 nDCG@10 on BEIR amongst all neural\nsparse retrievers. In this work, we further uncover the reasons behind its\nperformance gain. We show that SPLADEv2 produces sparse representations with a\nmajority of tokens outside of the original query and document which is often\ncrucial for its performance gains, i.e. a limitation among its other sparse\ncounterparts. We provide our SPRINT toolkit, models, and data used in our\nexperiments publicly here at https://github.com/thakur-nandan/sprint.\n","authors":["Nandan Thakur","Kexin Wang","Iryna Gurevych","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2307.10488v1.pdf","comment":"Accepted at SIGIR 2023 (Resource Track)"},{"id":"http://arxiv.org/abs/2307.10485v1","updated":"2023-07-19T22:43:57Z","published":"2023-07-19T22:43:57Z","title":"FinGPT: Democratizing Internet-scale Data for Financial Large Language\n Models","summary":" Large language models (LLMs) have demonstrated remarkable proficiency in\nunderstanding and generating human-like texts, which may potentially\nrevolutionize the finance industry. However, existing LLMs often fall short in\nthe financial field, which is mainly attributed to the disparities between\ngeneral text data and financial text data. 
Unfortunately, there is only a\nlimited number of financial text datasets available (quite small size), and\nBloombergGPT, the first financial LLM (FinLLM), is close-sourced (only the\ntraining logs were released). In light of this, we aim to democratize\nInternet-scale financial data for LLMs, which is an open challenge due to\ndiverse data sources, low signal-to-noise ratio, and high time-validity. To\naddress the challenges, we introduce an open-sourced and data-centric\nframework, \\textit{Financial Generative Pre-trained Transformer (FinGPT)}, that\nautomates the collection and curation of real-time financial data from >34\ndiverse sources on the Internet, providing researchers and practitioners with\naccessible and transparent resources to develop their FinLLMs. Additionally, we\npropose a simple yet effective strategy for fine-tuning FinLLM using the\ninherent feedback from the market, dubbed Reinforcement Learning with Stock\nPrices (RLSP). We also adopt the Low-rank Adaptation (LoRA, QLoRA) method that\nenables users to customize their own FinLLMs from open-source general-purpose\nLLMs at a low cost. Finally, we showcase several FinGPT applications, including\nrobo-advisor, sentiment analysis for algorithmic trading, and low-code\ndevelopment. FinGPT aims to democratize FinLLMs, stimulate innovation, and\nunlock new opportunities in open finance. The codes are available at\nhttps://github.com/AI4Finance-Foundation/FinGPT and\nhttps://github.com/AI4Finance-Foundation/FinNLP\n","authors":["Xiao-Yang Liu","Guoxuan Wang","Daochen Zha"],"pdf_url":"https://arxiv.org/pdf/2307.10485v1.pdf","comment":"43 pages, 9 tables, and 3 figures"},{"id":"http://arxiv.org/abs/2307.10476v1","updated":"2023-07-19T22:14:58Z","published":"2023-07-19T22:14:58Z","title":"What can we learn from Data Leakage and Unlearning for Law?","summary":" Large Language Models (LLMs) have a privacy concern because they memorize\ntraining data (including personally identifiable information (PII) like emails\nand phone numbers) and leak it during inference. A company can train an LLM on\nits domain-customized data which can potentially also include their users' PII.\nIn order to comply with privacy laws such as the \"right to be forgotten\", the\ndata points of users that are most vulnerable to extraction could be deleted.\nWe find that once the most vulnerable points are deleted, a new set of points\nbecome vulnerable to extraction. So far, little attention has been given to\nunderstanding memorization for fine-tuned models. In this work, we also show\nthat not only do fine-tuned models leak their training data but they also leak\nthe pre-training data (and PII) memorized during the pre-training phase. The\nproperty of new data points becoming vulnerable to extraction after unlearning\nand leakage of pre-training data through fine-tuned models can pose significant\nprivacy and legal concerns for companies that use LLMs to offer services. 
We\nhope this work will start an interdisciplinary discussion within AI and law\ncommunities regarding the need for policies to tackle these issues.\n","authors":["Jaydeep Borkar"],"pdf_url":"https://arxiv.org/pdf/2307.10476v1.pdf","comment":"5 pages, 8 figures, accepted to the first GenLaw workshop at ICML'23,\n Hawai'i"},{"id":"http://arxiv.org/abs/2307.10475v1","updated":"2023-07-19T22:14:49Z","published":"2023-07-19T22:14:49Z","title":"Findings of Factify 2: Multimodal Fake News Detection","summary":" With social media usage growing exponentially in the past few years, fake\nnews has also become extremely prevalent. The detrimental impact of fake news\nemphasizes the need for research focused on automating the detection of false\ninformation and verifying its accuracy. In this work, we present the outcome of\nthe Factify 2 shared task, which provides a multi-modal fact verification and\nsatire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data\ncalls for a comparison based approach to the task by pairing social media\nclaims with supporting documents, with both text and image, divided into 5\nclasses based on multi-modal relations. In the second iteration of this task we\nhad over 60 participants and 9 final test-set submissions. The best\nperformances came from the use of DeBERTa for text and Swinv2 and CLIP for\nimage. The highest F1 score averaged for all five classes was 81.82%.\n","authors":["S Suryavardan","Shreyash Mishra","Megha Chakraborty","Parth Patwa","Anku Rani","Aman Chadha","Aishwarya Reganti","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.10475v1.pdf","comment":"Defactify2 @AAAI 2023"},{"id":"http://arxiv.org/abs/2307.10472v1","updated":"2023-07-19T22:03:40Z","published":"2023-07-19T22:03:40Z","title":"Can Instruction Fine-Tuned Language Models Identify Social Bias through\n Prompting?","summary":" As the breadth and depth of language model applications continue to expand\nrapidly, it is increasingly important to build efficient frameworks for\nmeasuring and mitigating the learned or inherited social biases of these\nmodels. In this paper, we present our work on evaluating instruction fine-tuned\nlanguage models' ability to identify bias through zero-shot prompting,\nincluding Chain-of-Thought (CoT) prompts. Across LLaMA and its two instruction\nfine-tuned versions, Alpaca 7B performs best on the bias identification task\nwith an accuracy of 56.7%. We also demonstrate that scaling up LLM size and\ndata diversity could lead to further performance gain. This is a\nwork-in-progress presenting the first component of our bias mitigation\nframework. We will keep updating this work as we get more results.\n","authors":["Omkar Dige","Jacob-Junqi Tian","David Emerson","Faiza Khan Khattak"],"pdf_url":"https://arxiv.org/pdf/2307.10472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10457v1","updated":"2023-07-19T21:00:16Z","published":"2023-07-19T21:00:16Z","title":"Improving Pre-trained Language Models' Generalization","summary":" The reusability of state-of-the-art Pre-trained Language Models (PLMs) is\noften limited by their generalization problem, where their performance\ndrastically decreases when evaluated on examples that differ from the training\ndataset, known as Out-of-Distribution (OOD)/unseen examples. This limitation\narises from PLMs' reliance on spurious correlations, which work well for\nfrequent example types but not for general examples. 
To address this issue, we\npropose a training approach called Mask-tuning, which integrates Masked\nLanguage Modeling (MLM) training objectives into the fine-tuning process to\nenhance PLMs' generalization. Comprehensive experiments demonstrate that\nMask-tuning surpasses current state-of-the-art techniques and enhances PLMs'\ngeneralization on OOD datasets while improving their performance on\nin-distribution datasets. The findings suggest that Mask-tuning improves the\nreusability of PLMs on unseen data, making them more practical and effective\nfor real-world applications.\n","authors":["Somayeh Ghanbarzadeh","Hamid Palangi","Yan Huang","Radames Cruz Moreno","Hamed Khanpour"],"pdf_url":"https://arxiv.org/pdf/2307.10457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10443v1","updated":"2023-07-19T20:17:37Z","published":"2023-07-19T20:17:37Z","title":"Integrating a Heterogeneous Graph with Entity-aware Self-attention using\n Relative Position Labels for Reading Comprehension Model","summary":" Despite the significant progress made by transformer models in machine\nreading comprehension tasks, they still face limitations in handling complex\nreasoning tasks due to the absence of explicit knowledge in the input sequence.\nThis paper proposes a novel attention pattern to overcome this limitation,\nwhich integrates reasoning knowledge derived from a heterogeneous graph into\nthe transformer architecture using a graph-enhanced self-attention mechanism.\nThe proposed attention pattern comprises three key elements: global-local\nattention for word tokens, graph attention for entity tokens that exhibit\nstrong attention towards tokens connected in the graph as opposed to those\nunconnected, and the consideration of the type of relationship between each\nentity token and word token. This results in optimized attention between the\ntwo if a relationship exists. The pattern is coupled with special relative\nposition labels, allowing it to integrate with LUKE's entity-aware\nself-attention mechanism. The experimental findings corroborate that our model\noutperforms both the cutting-edge LUKE-Graph and the baseline LUKE model on the\nReCoRD dataset that focuses on commonsense reasoning.\n","authors":["Shima Foolad","Kourosh Kiani"],"pdf_url":"https://arxiv.org/pdf/2307.10443v1.pdf","comment":"submitted for Knowledge-Based Systems Journal"},{"id":"http://arxiv.org/abs/2307.10442v1","updated":"2023-07-19T20:16:46Z","published":"2023-07-19T20:16:46Z","title":"Thrust: Adaptively Propels Large Language Models with External Knowledge","summary":" Although large-scale pre-trained language models (PTLMs) are shown to encode\nrich knowledge in their model parameters, the inherent knowledge in PTLMs can\nbe opaque or static, making external knowledge necessary. However, the existing\ninformation retrieval techniques could be costly and may even introduce noisy\nand sometimes misleading knowledge. To address these challenges, we propose the\ninstance-level adaptive propulsion of external knowledge (IAPEK), where we only\nconduct the retrieval when necessary. To achieve this goal, we propose\nmeasuring whether a PTLM contains enough knowledge to solve an instance with a\nnovel metric, Thrust, which leverages the representation distribution of a\nsmall number of seen instances. 
Extensive experiments demonstrate that thrust\nis a good measurement of PTLM models' instance-level knowledgeability.\nMoreover, we can achieve significantly higher cost-efficiency with the Thrust\nscore as the retrieval indicator than the naive usage of external knowledge on\n88% of the evaluated tasks with 26% average performance improvement. Such\nfindings shed light on the real-world practice of knowledge-enhanced LMs with a\nlimited knowledge-seeking budget due to computation latency or costs.\n","authors":["Xinran Zhao","Hongming Zhang","Xiaoman Pan","Wenlin Yao","Dong Yu","Jianshu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.10442v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2301.13816v4","updated":"2023-07-19T19:55:31Z","published":"2023-01-31T18:02:26Z","title":"Execution-based Code Generation using Deep Reinforcement Learning","summary":" The utilization of programming language (PL) models, pre-trained on\nlarge-scale code corpora, as a means of automating software engineering\nprocesses has demonstrated considerable potential in streamlining various code\ngeneration tasks such as code completion, code translation, and program\nsynthesis. However, current approaches mainly rely on supervised fine-tuning\nobjectives borrowed from text generation, neglecting unique sequence-level\ncharacteristics of code, including but not limited to compilability as well as\nsyntactic and functional correctness. To address this limitation, we propose\nPPOCoder, a new framework for code generation that synergistically combines\npre-trained PL models with Proximal Policy Optimization (PPO) which is a widely\nused deep reinforcement learning technique. By utilizing non-differentiable\nfeedback from code execution and structure alignment, PPOCoder seamlessly\nintegrates external code-specific knowledge into the model optimization\nprocess. It's important to note that PPOCoder is a task-agnostic and\nmodel-agnostic framework that can be used across different code generation\ntasks and PLs. Extensive experiments on three code generation tasks demonstrate\nthe effectiveness of our proposed approach compared to SOTA methods, achieving\nsignificant improvements in compilation success rates and functional\ncorrectness across different PLs.\n","authors":["Parshin Shojaee","Aneesh Jain","Sindhu Tipirneni","Chandan K. Reddy"],"pdf_url":"https://arxiv.org/pdf/2301.13816v4.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR), 2023"},{"id":"http://arxiv.org/abs/2307.10432v1","updated":"2023-07-19T19:40:34Z","published":"2023-07-19T19:40:34Z","title":"PharmacyGPT: The AI Pharmacist","summary":" In this study, we introduce PharmacyGPT, a novel framework to assess the\ncapabilities of large language models (LLMs) such as ChatGPT and GPT-4 in\nemulating the role of clinical pharmacists. Our methodology encompasses the\nutilization of LLMs to generate comprehensible patient clusters, formulate\nmedication plans, and forecast patient outcomes. We conduct our investigation\nusing real data acquired from the intensive care unit (ICU) at the University\nof North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable\ninsights into the potential applications and limitations of LLMs in the field\nof clinical pharmacy, with implications for both patient care and the\ndevelopment of future AI-driven healthcare solutions. 
By evaluating the\nperformance of PharmacyGPT, we aim to contribute to the ongoing discourse\nsurrounding the integration of artificial intelligence in healthcare settings,\nultimately promoting the responsible and efficacious use of such technologies.\n","authors":["Zhengliang Liu","Zihao Wu","Mengxuan Hu","Bokai Zhao","Lin Zhao","Tianyi Zhang","Haixing Dai","Xianyan Chen","Ye Shen","Sheng Li","Brian Murray","Tianming Liu","Andrea Sikora"],"pdf_url":"https://arxiv.org/pdf/2307.10432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09826v2","updated":"2023-07-19T19:30:52Z","published":"2023-04-16T11:22:59Z","title":"Fairness in AI and Its Long-Term Implications on Society","summary":" Successful deployment of artificial intelligence (AI) in various settings has\nled to numerous positive outcomes for individuals and society. However, AI\nsystems have also been shown to harm parts of the population due to biased\npredictions. AI fairness focuses on mitigating such biases to ensure AI\ndecision making is not discriminatory towards certain groups. We take a closer\nlook at AI fairness and analyze how lack of AI fairness can lead to deepening\nof biases over time and act as a social stressor. More specifically, we discuss\nhow biased models can lead to more negative real-world outcomes for certain\ngroups, which may then become more prevalent by deploying new AI models trained\non increasingly biased data, resulting in a feedback loop. If the issues\npersist, they could be reinforced by interactions with other risks and have\nsevere implications on society in the form of social unrest. We examine current\nstrategies for improving AI fairness, assess their limitations in terms of\nreal-world deployment, and explore potential paths forward to ensure we reap\nAI's benefits without causing society's collapse.\n","authors":["Ondrej Bohdal","Timothy Hospedales","Philip H. S. Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2304.09826v2.pdf","comment":"Stanford Existential Risks Conference 2023"},{"id":"http://arxiv.org/abs/2306.17582v2","updated":"2023-07-19T19:30:28Z","published":"2023-02-20T06:39:06Z","title":"ChatGPT for Robotics: Design Principles and Model Abilities","summary":" This paper presents an experimental study regarding the use of OpenAI's\nChatGPT for robotics applications. We outline a strategy that combines design\nprinciples for prompt engineering and the creation of a high-level function\nlibrary which allows ChatGPT to adapt to different robotics tasks, simulators,\nand form factors. We focus our evaluations on the effectiveness of different\nprompt engineering techniques and dialog strategies towards the execution of\nvarious types of robotics tasks. We explore ChatGPT's ability to use free-form\ndialog, parse XML tags, and to synthesize code, in addition to the use of\ntask-specific prompting functions and closed-loop reasoning through dialogues.\nOur study encompasses a range of tasks within the robotics domain, from basic\nlogical, geometrical, and mathematical reasoning all the way to complex domains\nsuch as aerial navigation, manipulation, and embodied agents. We show that\nChatGPT can be effective at solving several of such tasks, while allowing users\nto interact with it primarily via natural language instructions. 
In addition to\nthese studies, we introduce an open-sourced research tool called PromptCraft,\nwhich contains a platform where researchers can collaboratively upload and vote\non examples of good prompting schemes for robotics applications, as well as a\nsample robotics simulator with ChatGPT integration, making it easier for users\nto get started with using ChatGPT for robotics.\n","authors":["Sai Vemprala","Rogerio Bonatti","Arthur Bucker","Ashish Kapoor"],"pdf_url":"https://arxiv.org/pdf/2306.17582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10323v1","updated":"2023-07-19T07:20:30Z","published":"2023-07-19T07:20:30Z","title":"IncDSI: Incrementally Updatable Document Retrieval","summary":" Differentiable Search Index is a recently proposed paradigm for document\nretrieval, that encodes information about a corpus of documents within the\nparameters of a neural network and directly maps queries to corresponding\ndocuments. These models have achieved state-of-the-art performances for\ndocument retrieval across many benchmarks. These kinds of models have a\nsignificant limitation: it is not easy to add new documents after a model is\ntrained. We propose IncDSI, a method to add documents in real time (about\n20-50ms per document), without retraining the model on the entire dataset (or\neven parts thereof). Instead we formulate the addition of documents as a\nconstrained optimization problem that makes minimal changes to the network\nparameters. Although orders of magnitude faster, our approach is competitive\nwith re-training the model on the whole dataset and enables the development of\ndocument retrieval systems that can be updated with new information in\nreal-time. Our code for IncDSI is available at\nhttps://github.com/varshakishore/IncDSI.\n","authors":["Varsha Kishore","Chao Wan","Justin Lovelace","Yoav Artzi","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2307.10323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00370v2","updated":"2023-07-19T06:55:04Z","published":"2023-07-01T15:44:53Z","title":"Improving Text Matching in E-Commerce Search with A Rationalizable,\n Intervenable and Fast Entity-Based Relevance Model","summary":" Discovering the intended items of user queries from a massive repository of\nitems is one of the main goals of an e-commerce search system. Relevance\nprediction is essential to the search system since it helps improve\nperformance. When online serving a relevance model, the model is required to\nperform fast and accurate inference. Currently, the widely used models such as\nBi-encoder and Cross-encoder have their limitations in accuracy or inference\nspeed respectively. In this work, we propose a novel model called the\nEntity-Based Relevance Model (EBRM). We identify the entities contained in an\nitem and decompose the QI (query-item) relevance problem into multiple QE\n(query-entity) relevance problems; we then aggregate their results to form the\nQI prediction using a soft logic formulation. The decomposition allows us to\nuse a Cross-encoder QE relevance module for high accuracy as well as cache QE\npredictions for fast online inference. Utilizing soft logic makes the\nprediction procedure interpretable and intervenable. We also show that\npretraining the QE module with auto-generated QE data from user logs can\nfurther improve the overall performance. The proposed method is evaluated on\nlabeled data from e-commerce websites. 
Empirical results show that it achieves\npromising improvements with computation efficiency.\n","authors":["Jiong Cai","Yong Jiang","Yue Zhang","Chengyue Jiang","Ke Yu","Jianhui Ji","Rong Xiao","Haihong Tang","Tao Wang","Zhongqiang Huang","Pengjun Xie","Fei Huang","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2307.00370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10314v1","updated":"2023-07-19T03:31:41Z","published":"2023-07-19T03:31:41Z","title":"Mood Classification of Bangla Songs Based on Lyrics","summary":" Music can evoke various emotions, and with the advancement of technology, it\nhas become more accessible to people. Bangla music, which portrays different\nhuman emotions, lacks sufficient research. The authors of this article aim to\nanalyze Bangla songs and classify their moods based on the lyrics. To achieve\nthis, this research has compiled a dataset of 4000 Bangla song lyrics, genres,\nand used Natural Language Processing and the Bert Algorithm to analyze the\ndata. Among the 4000 songs, 1513 songs are represented for the sad mood, 1362\nfor the romantic mood, 886 for happiness, and the rest 239 are classified as\nrelaxation. By embedding the lyrics of the songs, the authors have classified\nthe songs into four moods: Happy, Sad, Romantic, and Relaxed. This research is\ncrucial as it enables a multi-class classification of songs' moods, making the\nmusic more relatable to people's emotions. The article presents the automated\nresult of the four moods accurately derived from the song lyrics.\n","authors":["Maliha Mahajebin","Mohammad Rifat Ahmmad Rashid","Nafees Mansoor"],"pdf_url":"https://arxiv.org/pdf/2307.10314v1.pdf","comment":"Presented at International Conference on. Inventive Communication and\n Computational Technologies 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.10173v1","updated":"2023-07-19T17:58:03Z","published":"2023-07-19T17:58:03Z","title":"DNA-Rendering: A Diverse Neural Actor Repository for High-Fidelity\n Human-centric Rendering","summary":" Realistic human-centric rendering plays a key role in both computer vision\nand computer graphics. Rapid progress has been made in the algorithm aspect\nover the years, yet existing human-centric rendering datasets and benchmarks\nare rather impoverished in terms of diversity, which are crucial for rendering\neffect. Researchers are usually constrained to explore and evaluate a small set\nof rendering problems on current datasets, while real-world applications\nrequire methods to be robust across different scenarios. In this work, we\npresent DNA-Rendering, a large-scale, high-fidelity repository of human\nperformance data for neural actor rendering. DNA-Rendering presents several\nalluring attributes. First, our dataset contains over 1500 human subjects, 5000\nmotion sequences, and 67.5M frames' data volume. Second, we provide rich assets\nfor each subject -- 2D/3D human body keypoints, foreground masks, SMPLX models,\ncloth/accessory materials, multi-view images, and videos. These assets boost\nthe current method's accuracy on downstream rendering tasks. Third, we\nconstruct a professional multi-view system to capture data, which contains 60\nsynchronous cameras with max 4096 x 3000 resolution, 15 fps speed, and stern\ncamera calibration steps, ensuring high-quality resources for task training and\nevaluation. 
Along with the dataset, we provide a large-scale and quantitative\nbenchmark in full-scale, with multiple tasks to evaluate the existing progress\nof novel view synthesis, novel pose animation synthesis, and novel identity\nrendering methods. In this manuscript, we describe our DNA-Rendering effort as\na revealing of new observations, challenges, and future directions to\nhuman-centric rendering. The dataset, code, and benchmarks will be publicly\navailable at https://dna-rendering.github.io/\n","authors":["Wei Cheng","Ruixiang Chen","Wanqi Yin","Siming Fan","Keyu Chen","Honglin He","Huiwen Luo","Zhongang Cai","Jingbo Wang","Yang Gao","Zhengming Yu","Zhengyu Lin","Daxuan Ren","Lei Yang","Ziwei Liu","Chen Change Loy","Chen Qian","Wayne Wu","Dahua Lin","Bo Dai","Kwan-Yee Lin"],"pdf_url":"https://arxiv.org/pdf/2307.10173v1.pdf","comment":"This paper is accepted by ICCV2023. Project page:\n https://dna-rendering.github.io/"},{"id":"http://arxiv.org/abs/2112.06809v8","updated":"2023-07-19T17:50:21Z","published":"2021-12-13T17:11:32Z","title":"Persistent Animal Identification Leveraging Non-Visual Markers","summary":" Our objective is to locate and provide a unique identifier for each mouse in\na cluttered home-cage environment through time, as a precursor to automated\nbehaviour recognition for biological research. This is a very challenging\nproblem due to (i) the lack of distinguishing visual features for each mouse,\nand (ii) the close confines of the scene with constant occlusion, making\nstandard visual tracking approaches unusable. However, a coarse estimate of\neach mouse's location is available from a unique RFID implant, so there is the\npotential to optimally combine information from (weak) tracking with coarse\ninformation on identity. To achieve our objective, we make the following key\ncontributions: (a) the formulation of the object identification problem as an\nassignment problem (solved using Integer Linear Programming), and (b) a novel\nprobabilistic model of the affinity between tracklets and RFID data. The latter\nis a crucial part of the model, as it provides a principled probabilistic\ntreatment of object detections given coarse localisation. Our approach achieves\n77% accuracy on this animal identification problem, and is able to reject\nspurious detections when the animals are hidden.\n","authors":["Michael P. J. Camilleri","Li Zhang","Rasneer S. Bains","Andrew Zisserman","Christopher K. I. Williams"],"pdf_url":"https://arxiv.org/pdf/2112.06809v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10166v1","updated":"2023-07-19T17:50:03Z","published":"2023-07-19T17:50:03Z","title":"Adversarial Latent Autoencoder with Self-Attention for Structural Image\n Synthesis","summary":" Generative Engineering Design approaches driven by Deep Generative Models\n(DGM) have been proposed to facilitate industrial engineering processes. In\nsuch processes, designs often come in the form of images, such as blueprints,\nengineering drawings, and CAD models depending on the level of detail. DGMs\nhave been successfully employed for synthesis of natural images, e.g.,\ndisplaying animals, human faces and landscapes. However, industrial design\nimages are fundamentally different from natural scenes in that they contain\nrich structural patterns and long-range dependencies, which are challenging for\nconvolution-based DGMs to generate. 
Moreover, DGM-driven generation process is\ntypically triggered based on random noisy inputs, which outputs unpredictable\nsamples and thus cannot perform an efficient industrial design exploration. We\ntackle these challenges by proposing a novel model Self-Attention Adversarial\nLatent Autoencoder (SA-ALAE), which allows generating feasible design images of\ncomplex engineering parts. With SA-ALAE, users can not only explore novel\nvariants of an existing design, but also control the generation process by\noperating in latent space. The potential of SA-ALAE is shown by generating\nengineering blueprints in a real automotive design task.\n","authors":["Jiajie Fan","Laure Vuaille","Hao Wang","Thomas Bäck"],"pdf_url":"https://arxiv.org/pdf/2307.10166v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.10165v1","updated":"2023-07-19T17:46:55Z","published":"2023-07-19T17:46:55Z","title":"Drone navigation and license place detection for vehicle location in\n indoor spaces","summary":" Millions of vehicles are transported every year, tightly parked in vessels or\nboats. To reduce the risks of associated safety issues like fires, knowing the\nlocation of vehicles is essential, since different vehicles may need different\nmitigation measures, e.g. electric cars. This work is aimed at creating a\nsolution based on a nano-drone that navigates across rows of parked vehicles\nand detects their license plates. We do so via a wall-following algorithm, and\na CNN trained to detect license plates. All computations are done in real-time\non the drone, which just sends position and detected images that allow the\ncreation of a 2D map with the position of the plates. Our solution is capable\nof reading all plates across eight test cases (with several rows of plates,\ndifferent drone speeds, or low light) by aggregation of measurements across\nseveral drone journeys.\n","authors":["Moa Arvidsson","Sithichot Sawirot","Cristofer Englund","Fernando Alonso-Fernandez","Martin Torstensson","Boris Duran"],"pdf_url":"https://arxiv.org/pdf/2307.10165v1.pdf","comment":"Published at VIII International Workshop on Artificial Intelligence\n and Pattern Recognition, IWAIPR"},{"id":"http://arxiv.org/abs/2307.10160v1","updated":"2023-07-19T17:42:36Z","published":"2023-07-19T17:42:36Z","title":"Robust Driving Policy Learning with Guided Meta Reinforcement Learning","summary":" Although deep reinforcement learning (DRL) has shown promising results for\nautonomous navigation in interactive traffic scenarios, existing work typically\nadopts a fixed behavior policy to control social vehicles in the training\nenvironment. This may cause the learned driving policy to overfit the\nenvironment, making it difficult to interact well with vehicles with different,\nunseen behaviors. In this work, we introduce an efficient method to train\ndiverse driving policies for social vehicles as a single meta-policy. By\nrandomizing the interaction-based reward functions of social vehicles, we can\ngenerate diverse objectives and efficiently train the meta-policy through\nguiding policies that achieve specific objectives. We further propose a\ntraining strategy to enhance the robustness of the ego vehicle's driving policy\nusing the environment where social vehicles are controlled by the learned\nmeta-policy. 
Our method successfully learns an ego driving policy that\ngeneralizes well to unseen situations with out-of-distribution (OOD) social\nagents' behaviors in a challenging uncontrolled T-intersection scenario.\n","authors":["Kanghoon Lee","Jiachen Li","David Isele","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2307.10160v1.pdf","comment":"ITSC 2023"},{"id":"http://arxiv.org/abs/2307.10159v1","updated":"2023-07-19T17:39:39Z","published":"2023-07-19T17:39:39Z","title":"FABRIC: Personalizing Diffusion Models with Iterative Feedback","summary":" In an era where visual content generation is increasingly driven by machine\nlearning, the integration of human feedback into generative models presents\nsignificant opportunities for enhancing user experience and output quality.\nThis study explores strategies for incorporating iterative human feedback into\nthe generative process of diffusion-based text-to-image models. We propose\nFABRIC, a training-free approach applicable to a wide range of popular\ndiffusion models, which exploits the self-attention layer present in the most\nwidely used architectures to condition the diffusion process on a set of\nfeedback images. To ensure a rigorous assessment of our approach, we introduce\na comprehensive evaluation methodology, offering a robust mechanism to quantify\nthe performance of generative visual models that integrate human feedback. We\nshow that generation results improve over multiple rounds of iterative feedback\nthrough exhaustive analysis, implicitly optimizing arbitrary user preferences.\nThe potential applications of these findings extend to fields such as\npersonalized content creation and customization.\n","authors":["Dimitri von Rütte","Elisabetta Fedele","Jonathan Thomm","Lukas Wolf"],"pdf_url":"https://arxiv.org/pdf/2307.10159v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.10157v1","updated":"2023-07-19T17:38:26Z","published":"2023-07-19T17:38:26Z","title":"Leveraging Visemes for Better Visual Speech Representation and Lip\n Reading","summary":" Lip reading is a challenging task that has many potential applications in\nspeech recognition, human-computer interaction, and security systems. However,\nexisting lip reading systems often suffer from low accuracy due to the\nlimitations of video features. In this paper, we propose a novel approach that\nleverages visemes, which are groups of phonetically similar lip shapes, to\nextract more discriminative and robust video features for lip reading. We\nevaluate our approach on various tasks, including word-level and sentence-level\nlip reading, and audiovisual speech recognition using the Arman-AV dataset, a\nlarge-scale Persian corpus. Our experimental results show that our viseme based\napproach consistently outperforms the state-of-the-art methods in all these\ntasks. 
The proposed method reduces the lip-reading word error rate (WER) by\n9.1% relative to the best previous method.\n","authors":["Javad Peymanfard","Vahid Saeedi","Mohammad Reza Mohammadi","Hossein Zeinali","Nasser Mozayani"],"pdf_url":"https://arxiv.org/pdf/2307.10157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10135v1","updated":"2023-07-19T17:00:45Z","published":"2023-07-19T17:00:45Z","title":"An Improved NeuMIP with Better Accuracy","summary":" Neural reflectance models are capable of accurately reproducing the\nspatially-varying appearance of many real-world materials at different scales.\nHowever, existing methods have difficulties handling highly glossy materials.\nTo address this problem, we introduce a new neural reflectance model which,\ncompared with existing methods, better preserves not only specular highlights\nbut also fine-grained details. To this end, we enhance the neural network\nperformance by encoding input data to frequency space, inspired by NeRF, to\nbetter preserve the details. Furthermore, we introduce a gradient-based loss\nand employ it in multiple stages, adaptive to the progress of the learning\nphase. Lastly, we utilize an optional extension to the decoder network using\nthe Inception module for more accurate yet costly performance. We demonstrate\nthe effectiveness of our method using a variety of synthetic and real examples.\n","authors":["Bowen Xue","Shuang Zhao","Henrik Wann Jensen","Zahra Montazeri"],"pdf_url":"https://arxiv.org/pdf/2307.10135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10129v1","updated":"2023-07-19T16:51:59Z","published":"2023-07-19T16:51:59Z","title":"General vs. Long-Tailed Age Estimation: An Approach to Kill Two Birds\n with One Stone","summary":" Facial age estimation has received a lot of attention for its diverse\napplication scenarios. Most existing studies treat each sample equally and aim\nto reduce the average estimation error for the entire dataset, which can be\nsummarized as General Age Estimation. However, due to the long-tailed\ndistribution prevalent in the dataset, treating all samples equally will\ninevitably bias the model toward the head classes (usually the adult with a\nmajority of samples). Driven by this, some works suggest that each class should\nbe treated equally to improve performance in tail classes (with a minority of\nsamples), which can be summarized as Long-tailed Age Estimation. However,\nLong-tailed Age Estimation usually faces a performance trade-off, i.e.,\nachieving improvement in tail classes by sacrificing the head classes. In this\npaper, our goal is to design a unified framework to perform well on both tasks,\nkilling two birds with one stone. To this end, we propose a simple, effective,\nand flexible training paradigm named GLAE, which is two-fold. Our GLAE provides\na surprising improvement on Morph II, reaching the lowest MAE and CMAE of 1.14\nand 1.27 years, respectively. Compared to the previous best method, MAE dropped\nby up to 34%, which is an unprecedented improvement, and for the first time,\nMAE is close to 1 year old. 
Extensive experiments on other age benchmark\ndatasets, including CACD, MIVIA, and Chalearn LAP 2015, also indicate that GLAE\noutperforms the state-of-the-art approaches significantly.\n","authors":["Zenghao Bao","Zichang Tan","Jun Li","Jun Wan","Xibo Ma","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2307.10129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10123v1","updated":"2023-07-19T16:42:52Z","published":"2023-07-19T16:42:52Z","title":"Two Approaches to Supervised Image Segmentation","summary":" Though performed almost effortlessly by humans, segmenting 2D gray-scale or\ncolor images in terms of their constituent regions of interest\n(e.g.~background, objects or portions of objects) constitutes one of the\ngreatest challenges in science and technology as a consequence of the involved\ndimensionality reduction(3D to 2D), noise, reflections, shades, and occlusions,\namong many other possible effects. While a large number of interesting\napproaches have been respectively suggested along the last decades, it was\nmainly with the more recent development of deep learning that more effective\nand general solutions have been obtained, currently constituting the basic\ncomparison reference for this type of operation. Also developed recently, a\nmultiset-based methodology has been described that is capable of encouraging\nperformance that combines spatial accuracy, stability, and robustness while\nrequiring minimal computational resources (hardware and/or training and\nrecognition time). The interesting features of the latter methodology mostly\nfollow from the enhanced selectivity and sensitivity, as well as good\nrobustness to data perturbations and outliers, allowed by the coincidence\nsimilarity index on which the multiset approach to supervised image\nsegmentation is based. After describing the deep learning and multiset\napproaches, the present work develops two comparison experiments between them\nwhich are primarily aimed at illustrating their respective main interesting\nfeatures when applied to the adopted specific type of data and parameter\nconfigurations. While the deep learning approach confirmed its potential for\nperforming image segmentation, the alternative multiset methodology allowed for\nencouraging accuracy while requiring little computational resources.\n","authors":["Alexandre Benatti","Luciano da F. Costa"],"pdf_url":"https://arxiv.org/pdf/2307.10123v1.pdf","comment":"37 pages, 18 figures"},{"id":"http://arxiv.org/abs/2103.03328v3","updated":"2023-07-19T16:19:53Z","published":"2021-03-04T20:58:22Z","title":"Evaluation of Complexity Measures for Deep Learning Generalization in\n Medical Image Analysis","summary":" The generalization performance of deep learning models for medical image\nanalysis often decreases on images collected with different devices for data\nacquisition, device settings, or patient population. A better understanding of\nthe generalization capacity on new images is crucial for clinicians'\ntrustworthiness in deep learning. Although significant research efforts have\nbeen recently directed toward establishing generalization bounds and complexity\nmeasures, still, there is often a significant discrepancy between the predicted\nand actual generalization performance. 
As well, related large empirical studies\nhave been primarily based on validation with general-purpose image datasets.\nThis paper presents an empirical study that investigates the correlation\nbetween 25 complexity measures and the generalization abilities of supervised\ndeep learning classifiers for breast ultrasound images. The results indicate\nthat PAC-Bayes flatness-based and path norm-based measures produce the most\nconsistent explanation for the combination of models and data. We also\ninvestigate the use of multi-task classification and segmentation approach for\nbreast images, and report that such learning approach acts as an implicit\nregularizer and is conducive toward improved generalization.\n","authors":["Aleksandar Vakanski","Min Xian"],"pdf_url":"https://arxiv.org/pdf/2103.03328v3.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.10097v1","updated":"2023-07-19T16:12:37Z","published":"2023-07-19T16:12:37Z","title":"Boundary-Refined Prototype Generation: A General End-to-End Paradigm for\n Semi-Supervised Semantic Segmentation","summary":" Prototype-based classification is a classical method in machine learning, and\nrecently it has achieved remarkable success in semi-supervised semantic\nsegmentation. However, the current approach isolates the prototype\ninitialization process from the main training framework, which appears to be\nunnecessary. Furthermore, while the direct use of K-Means algorithm for\nprototype generation has considered rich intra-class variance, it may not be\nthe optimal solution for the classification task. To tackle these problems, we\npropose a novel boundary-refined prototype generation (BRPG) method, which is\nincorporated into the whole training framework. Specifically, our approach\nsamples and clusters high- and low-confidence features separately based on a\nconfidence threshold, aiming to generate prototypes closer to the class\nboundaries. Moreover, an adaptive prototype optimization strategy is introduced\nto make prototype augmentation for categories with scattered feature\ndistributions. Extensive experiments on the PASCAL VOC 2012 and Cityscapes\ndatasets demonstrate the superiority and scalability of the proposed method,\noutperforming the current state-of-the-art approaches. The code is available at\nxxxxxxxxxxxxxx.\n","authors":["Junhao Dong","Zhu Meng","Delong Liu","Zhicheng Zhao","Fei Su"],"pdf_url":"https://arxiv.org/pdf/2307.10097v1.pdf","comment":"53 pages, 7 figures"},{"id":"http://arxiv.org/abs/2303.13479v2","updated":"2023-07-19T16:11:13Z","published":"2023-03-23T17:48:12Z","title":"IST-Net: Prior-free Category-level Pose Estimation with Implicit Space\n Transformation","summary":" Category-level 6D pose estimation aims to predict the poses and sizes of\nunseen objects from a specific category. Thanks to prior deformation, which\nexplicitly adapts a category-specific 3D prior (i.e., a 3D template) to a given\nobject instance, prior-based methods attained great success and have become a\nmajor research stream. However, obtaining category-specific priors requires\ncollecting a large amount of 3D models, which is labor-consuming and often not\naccessible in practice. This motivates us to investigate whether priors are\nnecessary to make prior-based methods effective. Our empirical study shows that\nthe 3D prior itself is not the credit to the high performance. 
The keypoint\nactually is the explicit deformation process, which aligns camera and world\ncoordinates supervised by world-space 3D models (also called canonical space).\nInspired by these observations, we introduce a simple prior-free implicit space\ntransformation network, namely IST-Net, to transform camera-space features to\nworld-space counterparts and build correspondence between them in an implicit\nmanner without relying on 3D priors. Besides, we design camera- and world-space\nenhancers to enrich the features with pose-sensitive information and\ngeometrical constraints, respectively. Albeit simple, IST-Net achieves\nstate-of-the-art performance based-on prior-free design, with top inference\nspeed on the REAL275 benchmark. Our code and models are available at\nhttps://github.com/CVMI-Lab/IST-Net.\n","authors":["Jianhui Liu","Yukang Chen","Xiaoqing Ye","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2303.13479v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.10094v1","updated":"2023-07-19T16:01:09Z","published":"2023-07-19T16:01:09Z","title":"Make-A-Volume: Leveraging Latent Diffusion Models for Cross-Modality 3D\n Brain MRI Synthesis","summary":" Cross-modality medical image synthesis is a critical topic and has the\npotential to facilitate numerous applications in the medical imaging field.\nDespite recent successes in deep-learning-based generative models, most current\nmedical image synthesis methods rely on generative adversarial networks and\nsuffer from notorious mode collapse and unstable training. Moreover, the 2D\nbackbone-driven approaches would easily result in volumetric inconsistency,\nwhile 3D backbones are challenging and impractical due to the tremendous memory\ncost and training difficulty. In this paper, we introduce a new paradigm for\nvolumetric medical data synthesis by leveraging 2D backbones and present a\ndiffusion-based framework, Make-A-Volume, for cross-modality 3D medical image\nsynthesis. To learn the cross-modality slice-wise mapping, we employ a latent\ndiffusion model and learn a low-dimensional latent space, resulting in high\ncomputational efficiency. To enable the 3D image synthesis and mitigate\nvolumetric inconsistency, we further insert a series of volumetric layers in\nthe 2D slice-mapping model and fine-tune them with paired 3D data. This\nparadigm extends the 2D image diffusion model to a volumetric version with a\nslightly increasing number of parameters and computation, offering a principled\nsolution for generic cross-modality 3D medical image synthesis. We showcase the\neffectiveness of our Make-A-Volume framework on an in-house SWI-MRA brain MRI\ndataset and a public T1-T2 brain MRI dataset. Experimental results demonstrate\nthat our framework achieves superior synthesis results with volumetric\nconsistency.\n","authors":["Lingting Zhu","Zeyue Xue","Zhenchao Jin","Xian Liu","Jingzhen He","Ziwei Liu","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2307.10094v1.pdf","comment":"Accepted by International Conference on Medical Image Computing and\n Computer Assisted Intervention (MICCAI 2023). 10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2207.00419v3","updated":"2023-07-19T16:00:08Z","published":"2022-06-18T00:26:52Z","title":"Self-Supervised Learning for Videos: A Survey","summary":" The remarkable success of deep learning in various domains relies on the\navailability of large-scale annotated datasets. 
However, obtaining annotations\nis expensive and requires great effort, which is especially challenging for\nvideos. Moreover, the use of human-generated annotations leads to models with\nbiased learning and poor domain generalization and robustness. As an\nalternative, self-supervised learning provides a way for representation\nlearning which does not require annotations and has shown promise in both image\nand video domains. Different from the image domain, learning video\nrepresentations are more challenging due to the temporal dimension, bringing in\nmotion and other environmental dynamics. This also provides opportunities for\nvideo-exclusive ideas that advance self-supervised learning in the video and\nmultimodal domain. In this survey, we provide a review of existing approaches\non self-supervised learning focusing on the video domain. We summarize these\nmethods into four different categories based on their learning objectives: 1)\npretext tasks, 2) generative learning, 3) contrastive learning, and 4)\ncross-modal agreement. We further introduce the commonly used datasets,\ndownstream evaluation tasks, insights into the limitations of existing works,\nand the potential future directions in this area.\n","authors":["Madeline C. Schiappa","Yogesh S. Rawat","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2207.00419v3.pdf","comment":"ACM CSUR (December 2022). Project Link: https://bit.ly/3Oimc7Q"},{"id":"http://arxiv.org/abs/2307.04838v2","updated":"2023-07-19T15:59:03Z","published":"2023-07-10T18:15:03Z","title":"CREPE: Learnable Prompting With CLIP Improves Visual Relationship\n Prediction","summary":" In this paper, we explore the potential of Vision-Language Models (VLMs),\nspecifically CLIP, in predicting visual object relationships, which involves\ninterpreting visual features from images into language-based relations. Current\nstate-of-the-art methods use complex graphical models that utilize language\ncues and visual features to address this challenge. We hypothesize that the\nstrong language priors in CLIP embeddings can simplify these graphical models\npaving for a simpler approach. We adopt the UVTransE relation prediction\nframework, which learns the relation as a translational embedding with subject,\nobject, and union box embeddings from a scene. We systematically explore the\ndesign of CLIP-based subject, object, and union-box representations within the\nUVTransE framework and propose CREPE (CLIP Representation Enhanced Predicate\nEstimation). CREPE utilizes text-based representations for all three bounding\nboxes and introduces a novel contrastive training strategy to automatically\ninfer the text prompt for union-box. Our approach achieves state-of-the-art\nperformance in predicate estimation, mR@5 27.79, and mR@20 31.95 on the Visual\nGenome benchmark, achieving a 15.3\\% gain in performance over recent\nstate-of-the-art at mR@20. This work demonstrates CLIP's effectiveness in\nobject relation prediction and encourages further research on VLMs in this\nchallenging domain.\n","authors":["Rakshith Subramanyam","T. S. Jayram","Rushil Anirudh","Jayaraman J. Thiagarajan"],"pdf_url":"https://arxiv.org/pdf/2307.04838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07894v3","updated":"2023-07-19T15:57:12Z","published":"2023-06-13T16:39:39Z","title":"iSLAM: Imperative SLAM","summary":" Simultaneous localization and mapping (SLAM) stands as one of the critical\nchallenges in robot navigation. 
Recent advancements suggest that methods based\non supervised learning deliver impressive performance in front-end odometry,\nwhile traditional optimization-based methods still play a vital role in the\nback-end for minimizing estimation drift. In this paper, we found that such\ndecoupled paradigm can lead to only sub-optimal performance, consequently\ncurtailing system capabilities and generalization potential. To solve this\nproblem, we proposed a novel self-supervised learning framework, imperative\nSLAM (iSLAM), which fosters reciprocal correction between the front-end and\nback-end, thus enhancing performance without necessitating any external\nsupervision. Specifically, we formulate a SLAM system as a bi-level\noptimization problem so that the two components are bidirectionally connected.\nAs a result, the front-end model is able to learn global geometric knowledge\nobtained through pose graph optimization by back-propagating the residuals from\nthe back-end. This significantly improves the generalization ability of the\nentire system and thus achieves the accuracy improvement up to 45%. To the best\nof our knowledge, iSLAM is the first SLAM system showing that the front-end and\nback-end can learn jointly and mutually contribute to each other in a\nself-supervised manner.\n","authors":["Taimeng Fu","Shaoshu Su","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2306.07894v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10062v1","updated":"2023-07-19T15:33:11Z","published":"2023-07-19T15:33:11Z","title":"Unsupervised Accuracy Estimation of Deep Visual Models using\n Domain-Adaptive Adversarial Perturbation without Source Samples","summary":" Deploying deep visual models can lead to performance drops due to the\ndiscrepancies between source and target distributions. Several approaches\nleverage labeled source data to estimate target domain accuracy, but accessing\nlabeled source data is often prohibitively difficult due to data\nconfidentiality or resource limitations on serving devices. Our work proposes a\nnew framework to estimate model accuracy on unlabeled target data without\naccess to source data. We investigate the feasibility of using pseudo-labels\nfor accuracy estimation and evolve this idea into adopting recent advances in\nsource-free domain adaptation algorithms. Our approach measures the\ndisagreement rate between the source hypothesis and the target pseudo-labeling\nfunction, adapted from the source hypothesis. We mitigate the impact of\nerroneous pseudo-labels that may arise due to a high ideal joint hypothesis\nrisk by employing adaptive adversarial perturbation on the input of the target\nmodel. Our proposed source-free framework effectively addresses the challenging\ndistribution shift scenarios and outperforms existing methods requiring source\ndata and labels for training.\n","authors":["JoonHo Lee","Jae Oh Woo","Hankyu Moon","Kwonho Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10062v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10046v1","updated":"2023-07-19T15:22:06Z","published":"2023-07-19T15:22:06Z","title":"Divert More Attention to Vision-Language Object Tracking","summary":" Multimodal vision-language (VL) learning has noticeably pushed the tendency\ntoward generic intelligence owing to emerging large foundation models. However,\ntracking, as a fundamental vision problem, surprisingly enjoys less bonus from\nrecent flourishing VL learning. 
We argue that the reasons are two-fold: the\nlack of large-scale vision-language annotated videos and ineffective\nvision-language interaction learning of current works. These nuisances motivate\nus to design more effective vision-language representation for tracking,\nmeanwhile constructing a large database with language annotation for model\nlearning. Particularly, in this paper, we first propose a general attribute\nannotation strategy to decorate videos in six popular tracking benchmarks,\nwhich contributes a large-scale vision-language tracking database with more\nthan 23,000 videos. We then introduce a novel framework to improve tracking by\nlearning a unified-adaptive VL representation, where the cores are the proposed\nasymmetric architecture search and modality mixer (ModaMixer). To further\nimprove VL representation, we introduce a contrastive loss to align different\nmodalities. To thoroughly evidence the effectiveness of our method, we\nintegrate the proposed framework on three tracking methods with different\ndesigns, i.e., the CNN-based SiamCAR, the Transformer-based OSTrack, and the\nhybrid structure TransT. The experiments demonstrate that our framework can\nsignificantly improve all baselines on six benchmarks. Besides empirical\nresults, we theoretically analyze our approach to show its rationality. By\nrevealing the potential of VL representation, we expect the community to divert\nmore attention to VL tracking and hope to open more possibilities for future\ntracking with diversified multimodal messages.\n","authors":["Mingzhe Guo","Zhipeng Zhang","Liping Jing","Haibin Ling","Heng Fan"],"pdf_url":"https://arxiv.org/pdf/2307.10046v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10036v1","updated":"2023-07-19T15:19:02Z","published":"2023-07-19T15:19:02Z","title":"Class Attention to Regions of Lesion for Imbalanced Medical Image\n Recognition","summary":" Automated medical image classification is the key component in intelligent\ndiagnosis systems. However, most medical image datasets contain plenty of\nsamples of common diseases and just a handful of rare ones, leading to major\nclass imbalances. Currently, it is an open problem in intelligent diagnosis to\neffectively learn from imbalanced training data. In this paper, we propose a\nsimple yet effective framework, named \\textbf{C}lass \\textbf{A}ttention to\n\\textbf{RE}gions of the lesion (CARE), to handle data imbalance issues by\nembedding attention into the training process of \\textbf{C}onvolutional\n\\textbf{N}eural \\textbf{N}etworks (CNNs). The proposed attention module helps\nCNNs attend to lesion regions of rare diseases, therefore helping CNNs to learn\ntheir characteristics more effectively. In addition, this attention module\nworks only during the training phase and does not change the architecture of\nthe original network, so it can be directly combined with any existing CNN\narchitecture. The CARE framework needs bounding boxes to represent the lesion\nregions of rare diseases. To alleviate the need for manual annotation, we\nfurther developed variants of CARE by leveraging the traditional saliency\nmethods or a pretrained segmentation model for bounding box generation. Results\nshow that the CARE variants with automated bounding box generation are\ncomparable to the original CARE framework with \\textit{manual} bounding box\nannotations. 
A series of experiments on an imbalanced skin image dataset and a\npneumonia dataset indicates that our method can effectively help the network\nfocus on the lesion regions of rare diseases and remarkably improves the\nclassification performance of rare diseases.\n","authors":["Jia-Xin Zhuang","Jiabin Cai","Jianguo Zhang","Wei-shi Zheng","Ruixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10036v1.pdf","comment":"Accepted by Neurocomputing on July 2023. 37 pages"},{"id":"http://arxiv.org/abs/2307.06385v2","updated":"2023-07-19T14:51:37Z","published":"2023-07-12T18:13:58Z","title":"Temporal Label-Refinement for Weakly-Supervised Audio-Visual Event\n Localization","summary":" Audio-Visual Event Localization (AVEL) is the task of temporally localizing\nand classifying \\emph{audio-visual events}, i.e., events simultaneously visible\nand audible in a video. In this paper, we solve AVEL in a weakly-supervised\nsetting, where only video-level event labels (their presence/absence, but not\ntheir locations in time) are available as supervision for training. Our idea is\nto use a base model to estimate labels on the training data at a finer temporal\nresolution than at the video level and re-train the model with these labels.\nI.e., we determine the subset of labels for each \\emph{slice} of frames in a\ntraining video by (i) replacing the frames outside the slice with those from a\nsecond video having no overlap in video-level labels, and (ii) feeding this\nsynthetic video into the base model to extract labels for just the slice in\nquestion. To handle the out-of-distribution nature of our synthetic videos, we\npropose an auxiliary objective for the base model that induces more reliable\npredictions of the localized event labels as desired. Our three-stage pipeline\noutperforms several existing AVEL methods with no architectural changes and\nimproves performance on a related weakly-supervised task as well.\n","authors":["Kalyan Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2307.06385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10011v1","updated":"2023-07-19T14:49:14Z","published":"2023-07-19T14:49:14Z","title":"Towards Fair Face Verification: An In-depth Analysis of Demographic\n Biases","summary":" Deep learning-based person identification and verification systems have\nremarkably improved in terms of accuracy in recent years; however, such\nsystems, including widely popular cloud-based solutions, have been found to\nexhibit significant biases related to race, age, and gender, a problem that\nrequires in-depth exploration and solutions. This paper presents an in-depth\nanalysis, with a particular emphasis on the intersectionality of these\ndemographic factors. Intersectional bias refers to the performance\ndiscrepancies w.r.t. the different combinations of race, age, and gender\ngroups, an area relatively unexplored in current literature. Furthermore, the\nreliance of most state-of-the-art approaches on accuracy as the principal\nevaluation metric often masks significant demographic disparities in\nperformance. To counter this crucial limitation, we incorporate five additional\nmetrics in our quantitative analysis, including disparate impact and\nmistreatment metrics, which are typically ignored by the relevant\nfairness-aware approaches. Results on the Racial Faces in-the-Wild (RFW)\nbenchmark indicate pervasive biases in face recognition systems, extending\nbeyond race, with different demographic factors yielding significantly\ndisparate outcomes. 
In particular, Africans demonstrate an 11.25% lower True\nPositive Rate (TPR) compared to Caucasians, while only a 3.51% accuracy drop is\nobserved. Even more concerning, the intersections of multiple protected groups,\nsuch as African females over 60 years old, demonstrate a +39.89% disparate\nmistreatment rate compared to the highest Caucasians rate. By shedding light on\nthese biases and their implications, this paper aims to stimulate further\nresearch towards developing fairer, more equitable face recognition and\nverification systems.\n","authors":["Ioannis Sarridis","Christos Koutlis","Symeon Papadopoulos","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2307.10011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10008v1","updated":"2023-07-19T14:45:11Z","published":"2023-07-19T14:45:11Z","title":"MODA: Mapping-Once Audio-driven Portrait Animation with Dual Attentions","summary":" Audio-driven portrait animation aims to synthesize portrait videos that are\nconditioned by given audio. Animating high-fidelity and multimodal video\nportraits has a variety of applications. Previous methods have attempted to\ncapture different motion modes and generate high-fidelity portrait videos by\ntraining different models or sampling signals from given videos. However,\nlacking correlation learning between lip-sync and other movements (e.g., head\npose/eye blinking) usually leads to unnatural results. In this paper, we\npropose a unified system for multi-person, diverse, and high-fidelity talking\nportrait generation. Our method contains three stages, i.e., 1) Mapping-Once\nnetwork with Dual Attentions (MODA) generates talking representation from given\naudio. In MODA, we design a dual-attention module to encode accurate mouth\nmovements and diverse modalities. 2) Facial composer network generates dense\nand detailed face landmarks, and 3) temporal-guided renderer syntheses stable\nvideos. Extensive evaluations demonstrate that the proposed system produces\nmore natural and realistic video portraits compared to previous methods.\n","authors":["Yunfei Liu","Lijian Lin","Fei Yu","Changyin Zhou","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2307.10008v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09456v2","updated":"2023-07-19T14:27:57Z","published":"2023-07-18T17:35:45Z","title":"A comparative analysis of SRGAN models","summary":" In this study, we evaluate the performance of multiple state-of-the-art SRGAN\n(Super Resolution Generative Adversarial Network) models, ESRGAN, Real-ESRGAN\nand EDSR, on a benchmark dataset of real-world images which undergo degradation\nusing a pipeline. Our results show that some models seem to significantly\nincrease the resolution of the input images while preserving their visual\nquality, this is assessed using Tesseract OCR engine. We observe that EDSR-BASE\nmodel from huggingface outperforms the remaining candidate models in terms of\nboth quantitative metrics and subjective visual quality assessments with least\ncompute overhead. Specifically, EDSR generates images with higher peak\nsignal-to-noise ratio (PSNR) and structural similarity index (SSIM) values and\nare seen to return high quality OCR results with Tesseract OCR engine. 
These\nfindings suggest that EDSR is a robust and effective approach for single-image\nsuper-resolution and may be particularly well-suited for applications where\nhigh-quality visual fidelity is critical and optimized compute.\n","authors":["Fatemeh Rezapoor Nikroo","Ajinkya Deshmukh","Anantha Sharma","Adrian Tam","Kaarthik Kumar","Cleo Norris","Aditya Dangi"],"pdf_url":"https://arxiv.org/pdf/2307.09456v2.pdf","comment":"9 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2307.10003v1","updated":"2023-07-19T14:23:26Z","published":"2023-07-19T14:23:26Z","title":"TbExplain: A Text-based Explanation Method for Scene Classification\n Models with the Statistical Prediction Correction","summary":" The field of Explainable Artificial Intelligence (XAI) aims to improve the\ninterpretability of black-box machine learning models. Building a heatmap based\non the importance value of input features is a popular method for explaining\nthe underlying functions of such models in producing their predictions.\nHeatmaps are almost understandable to humans, yet they are not without flaws.\nNon-expert users, for example, may not fully understand the logic of heatmaps\n(the logic in which relevant pixels to the model's prediction are highlighted\nwith different intensities or colors). Additionally, objects and regions of the\ninput image that are relevant to the model prediction are frequently not\nentirely differentiated by heatmaps. In this paper, we propose a framework\ncalled TbExplain that employs XAI techniques and a pre-trained object detector\nto present text-based explanations of scene classification models. Moreover,\nTbExplain incorporates a novel method to correct predictions and textually\nexplain them based on the statistics of objects in the input image when the\ninitial prediction is unreliable. To assess the trustworthiness and validity of\nthe text-based explanations, we conducted a qualitative experiment, and the\nfindings indicated that these explanations are sufficiently reliable.\nFurthermore, our quantitative and qualitative experiments on TbExplain with\nscene classification datasets reveal an improvement in classification accuracy\nover ResNet variants.\n","authors":["Amirhossein Aminimehr","Pouya Khani","Amirali Molaei","Amirmohammad Kazemeini","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2307.10003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10001v1","updated":"2023-07-19T14:21:11Z","published":"2023-07-19T14:21:11Z","title":"As large as it gets: Learning infinitely large Filters via Neural\n Implicit Functions in the Fourier Domain","summary":" Motivated by the recent trend towards the usage of larger receptive fields\nfor more context-aware neural networks in vision applications, we aim to\ninvestigate how large these receptive fields really need to be. To facilitate\nsuch study, several challenges need to be addressed, most importantly: (i) We\nneed to provide an effective way for models to learn large filters (potentially\nas large as the input data) without increasing their memory consumption during\ntraining or inference, (ii) the study of filter sizes has to be decoupled from\nother effects such as the network width or number of learnable parameters, and\n(iii) the employed convolution operation should be a plug-and-play module that\ncan replace any conventional convolution in a Convolutional Neural Network\n(CNN) and allow for an efficient implementation in current frameworks. 
To\nfacilitate such models, we propose to learn not spatial but frequency\nrepresentations of filter weights as neural implicit functions, such that even\ninfinitely large filters can be parameterized by only a few learnable weights.\nThe resulting neural implicit frequency CNNs are the first models to achieve\nresults on par with the state-of-the-art on large image classification\nbenchmarks while executing convolutions solely in the frequency domain and can\nbe employed within any CNN architecture. They allow us to provide an extensive\nanalysis of the learned receptive fields. Interestingly, our analysis shows\nthat, although the proposed networks could learn very large convolution\nkernels, the learned filters practically translate into well-localized and\nrelatively small convolution kernels in the spatial domain.\n","authors":["Julia Grabinski","Janis Keuper","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.10001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08913v2","updated":"2023-07-19T14:18:00Z","published":"2023-07-18T01:16:23Z","title":"Towards the Sparseness of Projection Head in Self-Supervised Learning","summary":" In recent years, self-supervised learning (SSL) has emerged as a promising\napproach for extracting valuable representations from unlabeled data. One\nsuccessful SSL method is contrastive learning, which aims to bring positive\nexamples closer while pushing negative examples apart. Many current contrastive\nlearning approaches utilize a parameterized projection head. Through a\ncombination of empirical analysis and theoretical investigation, we provide\ninsights into the internal mechanisms of the projection head and its\nrelationship with the phenomenon of dimensional collapse. Our findings\ndemonstrate that the projection head enhances the quality of representations by\nperforming contrastive loss in a projected subspace. Therefore, we propose an\nassumption that only a subset of features is necessary when minimizing the\ncontrastive loss of a mini-batch of data. Theoretical analysis further suggests\nthat a sparse projection head can enhance generalization, leading us to\nintroduce SparseHead - a regularization term that effectively constrains the\nsparsity of the projection head, and can be seamlessly integrated with any\nself-supervised learning (SSL) approaches. Our experimental results validate\nthe effectiveness of SparseHead, demonstrating its ability to improve the\nperformance of existing contrastive methods.\n","authors":["Zeen Song","Xingzhe Su","Jingyao Wang","Wenwen Qiang","Changwen Zheng","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2307.08913v2.pdf","comment":"9 pages,3 figures"},{"id":"http://arxiv.org/abs/2307.09997v1","updated":"2023-07-19T14:10:55Z","published":"2023-07-19T14:10:55Z","title":"TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical\n Phase Recognition","summary":" To enable context-aware computer assistance in the operating room of the\nfuture, cognitive systems need to understand automatically which surgical phase\nis being performed by the medical team. The primary source of information for\nsurgical phase recognition is typically video, which presents two challenges:\nextracting meaningful features from the video stream and effectively modeling\ntemporal information in the sequence of visual features. For temporal modeling,\nattention mechanisms have gained popularity due to their ability to capture\nlong-range dependencies. 
In this paper, we explore design choices for attention\nin existing temporal models for surgical phase recognition and propose a novel\napproach that does not resort to local attention or regularization of attention\nweights: TUNeS is an efficient and simple temporal model that incorporates\nself-attention at the coarsest stage of a U-Net-like structure. In addition, we\npropose to train the feature extractor, a standard CNN, together with an LSTM\non preferably long video segments, i.e., with long temporal context. In our\nexperiments, all temporal models performed better on top of feature extractors\nthat were trained with longer temporal context. On top of these contextualized\nfeatures, TUNeS achieves state-of-the-art results on Cholec80.\n","authors":["Isabel Funke","Dominik Rivoir","Stefanie Krell","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2307.09997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09994v1","updated":"2023-07-19T13:58:01Z","published":"2023-07-19T13:58:01Z","title":"Impact of Disentanglement on Pruning Neural Networks","summary":" Deploying deep learning neural networks on edge devices, to accomplish task\nspecific objectives in the real-world, requires a reduction in their memory\nfootprint, power consumption, and latency. This can be realized via efficient\nmodel compression. Disentangled latent representations produced by variational\nautoencoder (VAE) networks are a promising approach for achieving model\ncompression because they mainly retain task-specific information, discarding\nuseless information for the task at hand. We make use of the Beta-VAE framework\ncombined with a standard criterion for pruning to investigate the impact of\nforcing the network to learn disentangled representations on the pruning\nprocess for the task of classification. In particular, we perform experiments\non MNIST and CIFAR10 datasets, examine disentanglement challenges, and propose\na path forward for future works.\n","authors":["Carl Shneider","Peyman Rostami","Anis Kacem","Nilotpal Sinha","Abd El Rahman Shabayek","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2307.09994v1.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2307.08347v2","updated":"2023-07-19T13:55:32Z","published":"2023-07-17T09:38:41Z","title":"M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models\n and Latent Space Geometry Optimization","summary":" Medical vision-language models enable co-learning and integrating features\nfrom medical imaging and clinical text. However, these models are not easy to\ntrain and the latent representation space can be complex. Here we propose a\nnovel way for pre-training and regularising medical vision-language models. The\nproposed method, named Medical vision-language pre-training with Frozen\nlanguage models and Latent spAce Geometry optimization (M-FLAG), leverages a\nfrozen language model for training stability and efficiency and introduces a\nnovel orthogonality loss to harmonize the latent space geometry. We demonstrate\nthe potential of the pre-trained model on three downstream tasks: medical image\nclassification, segmentation, and object detection. Extensive experiments\nacross five public datasets demonstrate that M-FLAG significantly outperforms\nexisting medical vision-language pre-training approaches and reduces the number\nof parameters by 78\\%. 
Notably, M-FLAG achieves outstanding performance on the\nsegmentation task while using only 1\\% of the RSNA dataset, even outperforming\nImageNet pre-trained models that have been fine-tuned using 100\\% of the data.\n","authors":["Che Liu","Sibo Cheng","Chen Chen","Mengyun Qiao","Weitong Zhang","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2307.08347v2.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.09988v1","updated":"2023-07-19T13:49:12Z","published":"2023-07-19T13:49:12Z","title":"TinyTrain: Deep Neural Network Training at the Extreme Edge","summary":" On-device training is essential for user personalisation and privacy. With\nthe pervasiveness of IoT devices and microcontroller units (MCU), this task\nbecomes more challenging due to the constrained memory and compute resources,\nand the limited availability of labelled user data. Nonetheless, prior works\nneglect the data scarcity issue, require excessively long training time (e.g. a\nfew hours), or induce substantial accuracy loss ($\\geq$10\\%). We propose\nTinyTrain, an on-device training approach that drastically reduces training\ntime by selectively updating parts of the model and explicitly coping with data\nscarcity. TinyTrain introduces a task-adaptive sparse-update method that\ndynamically selects the layer/channel based on a multi-objective criterion that\njointly captures user data, the memory, and the compute capabilities of the\ntarget device, leading to high accuracy on unseen tasks with reduced\ncomputation and memory footprint. TinyTrain outperforms vanilla fine-tuning of\nthe entire network by 3.6-5.0\\% in accuracy, while reducing the backward-pass\nmemory and computation cost by up to 2,286$\\times$ and 7.68$\\times$,\nrespectively. Targeting broadly used real-world edge devices, TinyTrain\nachieves 9.5$\\times$ faster and 3.5$\\times$ more energy-efficient training over\nstatus-quo approaches, and 2.8$\\times$ smaller memory footprint than SOTA\napproaches, while remaining within the 1 MB memory envelope of MCU-grade\nplatforms.\n","authors":["Young D. Kwon","Rui Li","Stylianos I. Venieris","Jagmohan Chauhan","Nicholas D. Lane","Cecilia Mascolo"],"pdf_url":"https://arxiv.org/pdf/2307.09988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09981v1","updated":"2023-07-19T13:40:45Z","published":"2023-07-19T13:40:45Z","title":"Lazy Visual Localization via Motion Averaging","summary":" Visual (re)localization is critical for various applications in computer\nvision and robotics. Its goal is to estimate the 6 degrees of freedom (DoF)\ncamera pose for each query image, based on a set of posed database images.\nCurrently, all leading solutions are structure-based that either explicitly\nconstruct 3D metric maps from the database with structure-from-motion, or\nimplicitly encode the 3D information with scene coordinate regression models.\nOn the contrary, visual localization without reconstructing the scene in 3D\noffers clear benefits. It makes deployment more convenient by reducing database\npre-processing time, releasing storage requirements, and remaining unaffected\nby imperfect reconstruction, etc. In this technical report, we demonstrate that\nit is possible to achieve high localization accuracy without reconstructing the\nscene from the database. The key to achieving this owes to a tailored motion\naveraging over database-query pairs. 
Experiments show that our visual\nlocalization proposal, LazyLoc, achieves comparable performance against\nstate-of-the-art structure-based methods. Furthermore, we showcase the\nversatility of LazyLoc, which can be easily extended to handle complex\nconfigurations such as multi-query co-localization and camera rigs.\n","authors":["Siyan Dong","Shaohui Liu","Hengkai Guo","Baoquan Chen","Marc Pollefeys"],"pdf_url":"https://arxiv.org/pdf/2307.09981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09362v2","updated":"2023-07-19T13:21:30Z","published":"2023-07-18T15:46:21Z","title":"Disentangle then Parse:Night-time Semantic Segmentation with\n Illumination Disentanglement","summary":" Most prior semantic segmentation methods have been developed for day-time\nscenes, while typically underperforming in night-time scenes due to\ninsufficient and complicated lighting conditions. In this work, we tackle this\nchallenge by proposing a novel night-time semantic segmentation paradigm, i.e.,\ndisentangle then parse (DTP). DTP explicitly disentangles night-time images\ninto light-invariant reflectance and light-specific illumination components and\nthen recognizes semantics based on their adaptive fusion. Concretely, the\nproposed DTP comprises two key components: 1) Instead of processing\nlighting-entangled features as in prior works, our Semantic-Oriented\nDisentanglement (SOD) framework enables the extraction of reflectance component\nwithout being impeded by lighting, allowing the network to consistently\nrecognize the semantics under cover of varying and complicated lighting\nconditions. 2) Based on the observation that the illumination component can\nserve as a cue for some semantically confused regions, we further introduce an\nIllumination-Aware Parser (IAParser) to explicitly learn the correlation\nbetween semantics and lighting, and aggregate the illumination features to\nyield more precise predictions. Extensive experiments on the night-time\nsegmentation task with various settings demonstrate that DTP significantly\noutperforms state-of-the-art methods. Furthermore, with negligible additional\nparameters, DTP can be directly used to benefit existing day-time methods for\nnight-time segmentation.\n","authors":["Zhixiang Wei","Lin Chen","Tao Tu","Huaian Chen","Pengyang Ling","Yi Jin"],"pdf_url":"https://arxiv.org/pdf/2307.09362v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.09946v2","updated":"2023-07-19T13:15:08Z","published":"2023-05-17T04:56:11Z","title":"AdaMSS: Adaptive Multi-Modality Segmentation-to-Survival Learning for\n Survival Outcome Prediction from PET/CT Images","summary":" Survival prediction is a major concern for cancer management. Deep survival\nmodels based on deep learning have been widely adopted to perform end-to-end\nsurvival prediction from medical images. Recent deep survival models achieved\npromising performance by jointly performing tumor segmentation with survival\nprediction, where the models were guided to extract tumor-related information\nthrough Multi-Task Learning (MTL). However, these deep survival models have\ndifficulties in exploring out-of-tumor prognostic information. In addition,\nexisting deep survival models are unable to effectively leverage multi-modality\nimages. Empirically-designed fusion strategies were commonly adopted to fuse\nmulti-modality information via task-specific manually-designed networks, thus\nlimiting the adaptability to different scenarios. 
In this study, we propose an\nAdaptive Multi-modality Segmentation-to-Survival model (AdaMSS) for survival\nprediction from PET/CT images. Instead of adopting MTL, we propose a novel\nSegmentation-to-Survival Learning (SSL) strategy, where our AdaMSS is trained\nfor tumor segmentation and survival prediction sequentially in two stages. This\nstrategy enables the AdaMSS to focus on tumor regions in the first stage and\ngradually expand its focus to include other prognosis-related regions in the\nsecond stage. We also propose a data-driven strategy to fuse multi-modality\ninformation, which realizes adaptive optimization of fusion strategies based on\ntraining data during training. With the SSL and data-driven fusion strategies,\nour AdaMSS is designed as an adaptive model that can self-adapt its focus\nregions and fusion strategy for different training stages. Extensive\nexperiments with two large clinical datasets show that our AdaMSS outperforms\nstate-of-the-art survival prediction methods.\n","authors":["Mingyuan Meng","Bingxin Gu","Michael Fulham","Shaoli Song","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2305.09946v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2305.18060v2","updated":"2023-07-19T13:13:39Z","published":"2023-05-29T12:53:54Z","title":"Mining Negative Temporal Contexts For False Positive Suppression In\n Real-Time Ultrasound Lesion Detection","summary":" During ultrasonic scanning processes, real-time lesion detection can assist\nradiologists in accurate cancer diagnosis. However, this essential task remains\nchallenging and underexplored. General-purpose real-time object detection\nmodels can mistakenly report obvious false positives (FPs) when applied to\nultrasound videos, potentially misleading junior radiologists. One key issue is\ntheir failure to utilize negative symptoms in previous frames, denoted as\nnegative temporal contexts (NTC). To address this issue, we propose to extract\ncontexts from previous frames, including NTC, with the guidance of inverse\noptical flow. By aggregating extracted contexts, we endow the model with the\nability to suppress FPs by leveraging NTC. We call the resulting model\nUltraDet. The proposed UltraDet demonstrates significant improvement over\nprevious state-of-the-arts and achieves real-time inference speed. We release\nthe code, checkpoints, and high-quality labels of the CVA-BUS dataset in\nhttps://github.com/HaojunYu1998/UltraDet.\n","authors":["Haojun Yu","Youcheng Li","QuanLin Wu","Ziwei Zhao","Dengbo Chen","Dong Wang","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2305.18060v2.pdf","comment":"10 pages, 4 figures, MICCAI 2023 Early Accept"},{"id":"http://arxiv.org/abs/2001.05887v4","updated":"2023-07-19T12:58:18Z","published":"2020-01-16T15:24:26Z","title":"MixPath: A Unified Approach for One-shot Neural Architecture Search","summary":" Blending multiple convolutional kernels is proved advantageous in neural\narchitecture design. However, current two-stage neural architecture search\nmethods are mainly limited to single-path search spaces. How to efficiently\nsearch models of multi-path structures remains a difficult problem. In this\npaper, we are motivated to train a one-shot multi-path supernet to accurately\nevaluate the candidate architectures. Specifically, we discover that in the\nstudied search spaces, feature vectors summed from multiple paths are nearly\nmultiples of those from a single path. Such disparity perturbs the supernet\ntraining and its ranking ability. 
Therefore, we propose a novel mechanism\ncalled Shadow Batch Normalization (SBN) to regularize the disparate feature\nstatistics. Extensive experiments prove that SBNs are capable of stabilizing\nthe optimization and improving ranking performance. We call our unified\nmulti-path one-shot approach as MixPath, which generates a series of models\nthat achieve state-of-the-art results on ImageNet.\n","authors":["Xiangxiang Chu","Shun Lu","Xudong Li","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2001.05887v4.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09947v1","updated":"2023-07-19T12:41:54Z","published":"2023-07-19T12:41:54Z","title":"U-CE: Uncertainty-aware Cross-Entropy for Semantic Segmentation","summary":" Deep neural networks have shown exceptional performance in various tasks, but\ntheir lack of robustness, reliability, and tendency to be overconfident pose\nchallenges for their deployment in safety-critical applications like autonomous\ndriving. In this regard, quantifying the uncertainty inherent to a model's\nprediction is a promising endeavour to address these shortcomings. In this\nwork, we present a novel Uncertainty-aware Cross-Entropy loss (U-CE) that\nincorporates dynamic predictive uncertainties into the training process by\npixel-wise weighting of the well-known cross-entropy loss (CE). Through\nextensive experimentation, we demonstrate the superiority of U-CE over regular\nCE training on two benchmark datasets, Cityscapes and ACDC, using two common\nbackbone architectures, ResNet-18 and ResNet-101. With U-CE, we manage to train\nmodels that not only improve their segmentation performance but also provide\nmeaningful uncertainties after training. Consequently, we contribute to the\ndevelopment of more robust and reliable segmentation models, ultimately\nadvancing the state-of-the-art in safety-critical applications and beyond.\n","authors":["Steven Landgraf","Markus Hillemann","Kira Wursthorn","Markus Ulrich"],"pdf_url":"https://arxiv.org/pdf/2307.09947v1.pdf","comment":"10 pages, 3 figures, 7 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2307.09944v1","updated":"2023-07-19T12:39:40Z","published":"2023-07-19T12:39:40Z","title":"ProtoCaps: A Fast and Non-Iterative Capsule Network Routing Method","summary":" Capsule Networks have emerged as a powerful class of deep learning\narchitectures, known for robust performance with relatively few parameters\ncompared to Convolutional Neural Networks (CNNs). However, their inherent\nefficiency is often overshadowed by their slow, iterative routing mechanisms\nwhich establish connections between Capsule layers, posing computational\nchallenges resulting in an inability to scale. In this paper, we introduce a\nnovel, non-iterative routing mechanism, inspired by trainable prototype\nclustering. This innovative approach aims to mitigate computational complexity,\nwhile retaining, if not enhancing, performance efficacy. Furthermore, we\nharness a shared Capsule subspace, negating the need to project each\nlower-level Capsule to each higher-level Capsule, thereby significantly\nreducing memory requisites during training. Our approach demonstrates superior\nresults compared to the current best non-iterative Capsule Network and tests on\nthe Imagewoof dataset, which is too computationally demanding to handle\nefficiently by iterative approaches. 
Our findings underscore the potential of\nour proposed methodology in enhancing the operational efficiency and\nperformance of Capsule Networks, paving the way for their application in\nincreasingly complex computational scenarios.\n","authors":["Miles Everett","Mingjun Zhong","Georgios Leontidis"],"pdf_url":"https://arxiv.org/pdf/2307.09944v1.pdf","comment":"8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.09936v1","updated":"2023-07-19T12:21:39Z","published":"2023-07-19T12:21:39Z","title":"AGAR: Attention Graph-RNN for Adaptative Motion Prediction of Point\n Clouds of Deformable Objects","summary":" This paper focuses on motion prediction for point cloud sequences in the\nchallenging case of deformable 3D objects, such as human body motion. First, we\ninvestigate the challenges caused by deformable shapes and complex motions\npresent in this type of representation, with the ultimate goal of understanding\nthe technical limitations of state-of-the-art models. From this understanding,\nwe propose an improved architecture for point cloud prediction of deformable 3D\nobjects. Specifically, to handle deformable shapes, we propose a graph-based\napproach that learns and exploits the spatial structure of point clouds to\nextract more representative features. Then we propose a module able to combine\nthe learned features in an adaptative manner according to the point cloud\nmovements. The proposed adaptative module controls the composition of local and\nglobal motions for each point, enabling the network to model complex motions in\ndeformable 3D objects more effectively. We tested the proposed method on the\nfollowing datasets: MNIST moving digits, the Mixamo human bodies motions, JPEG\nand CWIPC-SXR real-world dynamic bodies. Simulation results demonstrate that\nour method outperforms the current baseline methods given its improved ability\nto model complex movements as well as preserve point cloud shape. Furthermore,\nwe demonstrate the generalizability of the proposed framework for dynamic\nfeature learning, by testing the framework for action recognition on the\nMSRAction3D dataset and achieving results on-par with state-of-the-art methods\n","authors":["Pedro Gomes","Silvia Rossi","Laura Toni"],"pdf_url":"https://arxiv.org/pdf/2307.09936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09933v1","updated":"2023-07-19T12:15:06Z","published":"2023-07-19T12:15:06Z","title":"Spuriosity Didn't Kill the Classifier: Using Invariant Predictions to\n Harness Spurious Features","summary":" To avoid failures on out-of-distribution data, recent works have sought to\nextract features that have a stable or invariant relationship with the label\nacross domains, discarding the \"spurious\" or unstable features whose\nrelationship with the label changes across domains. However, unstable features\noften carry complementary information about the label that could boost\nperformance if used correctly in the test domain. Our main contribution is to\nshow that it is possible to learn how to use these unstable features in the\ntest domain without labels. In particular, we prove that pseudo-labels based on\nstable features provide sufficient guidance for doing so, provided that stable\nand unstable features are conditionally independent given the label. 
Based on\nthis theoretical insight, we propose Stable Feature Boosting (SFB), an\nalgorithm for: (i) learning a predictor that separates stable and\nconditionally-independent unstable features; and (ii) using the stable-feature\npredictions to adapt the unstable-feature predictions in the test domain.\nTheoretically, we prove that SFB can learn an asymptotically-optimal predictor\nwithout test-domain labels. Empirically, we demonstrate the effectiveness of\nSFB on real and synthetic data.\n","authors":["Cian Eastwood","Shashank Singh","Andrei Liviu Nicolicioiu","Marin Vlastelica","Julius von Kügelgen","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2307.09933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09931v1","updated":"2023-07-19T12:12:17Z","published":"2023-07-19T12:12:17Z","title":"DISA: DIfferentiable Similarity Approximation for Universal Multimodal\n Registration","summary":" Multimodal image registration is a challenging but essential step for\nnumerous image-guided procedures. Most registration algorithms rely on the\ncomputation of complex, frequently non-differentiable similarity metrics to\ndeal with the appearance discrepancy of anatomical structures between imaging\nmodalities. Recent Machine Learning based approaches are limited to specific\nanatomy-modality combinations and do not generalize to new settings. We propose\na generic framework for creating expressive cross-modal descriptors that enable\nfast deformable global registration. We achieve this by approximating existing\nmetrics with a dot-product in the feature space of a small convolutional neural\nnetwork (CNN) which is inherently differentiable can be trained without\nregistered data. Our method is several orders of magnitude faster than local\npatch-based metrics and can be directly applied in clinical settings by\nreplacing the similarity measure with the proposed one. Experiments on three\ndifferent datasets demonstrate that our approach generalizes well beyond the\ntraining data, yielding a broad capture range even on unseen anatomies and\nmodality pairs, without the need for specialized retraining. We make our\ntraining code and data publicly available.\n","authors":["Matteo Ronchetti","Wolfgang Wein","Nassir Navab","Oliver Zettinig","Raphael Prevost"],"pdf_url":"https://arxiv.org/pdf/2307.09931v1.pdf","comment":"This preprint was submitted to MICCAI 2023. The Version of Record of\n this contribution will be published in Springer LNCS"},{"id":"http://arxiv.org/abs/2307.09929v1","updated":"2023-07-19T12:11:15Z","published":"2023-07-19T12:11:15Z","title":"Measuring and Modeling Uncertainty Degree for Monocular Depth Estimation","summary":" Effectively measuring and modeling the reliability of a trained model is\nessential to the real-world deployment of monocular depth estimation (MDE)\nmodels. However, the intrinsic ill-posedness and ordinal-sensitive nature of\nMDE pose major challenges to the estimation of uncertainty degree of the\ntrained models. On the one hand, utilizing current uncertainty modeling methods\nmay increase memory consumption and are usually time-consuming. On the other\nhand, measuring the uncertainty based on model accuracy can also be\nproblematic, where uncertainty reliability and prediction accuracy are not well\ndecoupled. 
In this paper, we propose to model the uncertainty of MDE models\nfrom the perspective of the inherent probability distributions originating from\nthe depth probability volume and its extensions, and to assess it more fairly\nwith more comprehensive metrics. By simply introducing additional training\nregularization terms, our model, with surprisingly simple formations and\nwithout requiring extra modules or multiple inferences, can provide uncertainty\nestimations with state-of-the-art reliability, and can be further improved when\ncombined with ensemble or sampling methods. A series of experiments demonstrate\nthe effectiveness of our methods.\n","authors":["Mochu Xiang","Jing Zhang","Nick Barnes","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2307.09929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04639v2","updated":"2023-07-19T12:08:51Z","published":"2023-07-10T15:35:31Z","title":"Multimodal brain age estimation using interpretable adaptive\n population-graph learning","summary":" Brain age estimation is clinically important as it can provide valuable\ninformation in the context of neurodegenerative diseases such as Alzheimer's.\nPopulation graphs, which include multimodal imaging information of the subjects\nalong with the relationships among the population, have been used in literature\nalong with Graph Convolutional Networks (GCNs) and have proved beneficial for a\nvariety of medical imaging tasks. A population graph is usually static and\nconstructed manually using non-imaging information. However, graph construction\nis not a trivial task and might significantly affect the performance of the\nGCN, which is inherently very sensitive to the graph structure. In this work,\nwe propose a framework that learns a population graph structure optimized for\nthe downstream task. An attention mechanism assigns weights to a set of imaging\nand non-imaging features (phenotypes), which are then used for edge extraction.\nThe resulting graph is used to train the GCN. The entire pipeline can be\ntrained end-to-end. Additionally, by visualizing the attention weights that\nwere the most important for the graph construction, we increase the\ninterpretability of the graph. We use the UK Biobank, which provides a large\nvariety of neuroimaging and non-imaging phenotypes, to evaluate our method on\nbrain age regression and classification. The proposed method outperforms\ncompeting static graph approaches and other state-of-the-art adaptive methods.\nWe further show that the assigned attention scores indicate that there are both\nimaging and non-imaging phenotypes that are informative for brain age\nestimation and are in agreement with the relevant literature.\n","authors":["Kyriaki-Margarita Bintsi","Vasileios Baltatzis","Rolandos Alexandros Potamias","Alexander Hammers","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.04639v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2303.06635v2","updated":"2023-07-19T12:05:29Z","published":"2023-03-12T11:23:56Z","title":"Schema Inference for Interpretable Image Classification","summary":" In this paper, we study a novel inference paradigm, termed as schema\ninference, that learns to deductively infer the explainable predictions by\nrebuilding the prior deep neural network (DNN) forwarding scheme, guided by the\nprevalent philosophical cognitive concept of schema. 
We strive to reformulate\nthe conventional model inference pipeline into a graph matching policy that\nassociates the extracted visual concepts of an image with the pre-computed\nscene impression, by analogy with human reasoning mechanism via impression\nmatching. To this end, we devise an elaborated architecture, termed as\nSchemaNet, as a dedicated instantiation of the proposed schema inference\nconcept, that models both the visual semantics of input instances and the\nlearned abstract imaginations of target categories as topological relational\ngraphs. Meanwhile, to capture and leverage the compositional contributions of\nvisual semantics in a global view, we also introduce a universal Feat2Graph\nscheme in SchemaNet to establish the relational graphs that contain abundant\ninteraction information. Both the theoretical analysis and the experimental\nresults on several benchmarks demonstrate that the proposed schema inference\nachieves encouraging performance and meanwhile yields a clear picture of the\ndeductive process leading to the predictions. Our code is available at\nhttps://github.com/zhfeing/SchemaNet-PyTorch.\n","authors":["Haofei Zhang","Mengqi Xue","Xiaokang Liu","Kaixuan Chen","Jie Song","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2303.06635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07591v3","updated":"2023-07-19T12:04:59Z","published":"2023-06-13T07:35:28Z","title":"I See Dead People: Gray-Box Adversarial Attack on Image-To-Text Models","summary":" Modern image-to-text systems typically adopt the encoder-decoder framework,\nwhich comprises two main components: an image encoder, responsible for\nextracting image features, and a transformer-based decoder, used for generating\ncaptions. Taking inspiration from the analysis of neural networks' robustness\nagainst adversarial perturbations, we propose a novel gray-box algorithm for\ncreating adversarial examples in image-to-text models. Unlike image\nclassification tasks that have a finite set of class labels, finding visually\nsimilar adversarial examples in an image-to-text task poses greater challenges\nbecause the captioning system allows for a virtually infinite space of possible\ncaptions. In this paper, we present a gray-box adversarial attack on\nimage-to-text, both untargeted and targeted. We formulate the process of\ndiscovering adversarial perturbations as an optimization problem that uses only\nthe image-encoder component, meaning the proposed attack is language-model\nagnostic. Through experiments conducted on the ViT-GPT2 model, which is the\nmost-used image-to-text model in Hugging Face, and the Flickr30k dataset, we\ndemonstrate that our proposed attack successfully generates visually similar\nadversarial examples, both with untargeted and targeted captions. Notably, our\nattack operates in a gray-box manner, requiring no knowledge about the decoder\nmodule. We also show that our attacks fool the popular open-source platform\nHugging Face.\n","authors":["Raz Lapid","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2306.07591v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03544v2","updated":"2023-07-19T11:46:34Z","published":"2021-10-07T15:06:52Z","title":"RAR: Region-Aware Point Cloud Registration","summary":" This paper concerns the research problem of point cloud registration to find\nthe rigid transformation to optimally align the source point set with the\ntarget one. 
Learning robust point cloud registration models with deep neural\nnetworks has emerged as a powerful paradigm, offering promising performance in\npredicting the global geometric transformation for a pair of point sets.\nExisting methods firstly leverage an encoder to regress a latent shape\nembedding, which is then decoded into a shape-conditioned transformation via\nconcatenation-based conditioning. However, different regions of a 3D shape vary\nin their geometric structures which makes it more sense that we have a\nregion-conditioned transformation instead of the shape-conditioned one. In this\npaper we present a \\underline{R}egion-\\underline{A}ware point cloud\n\\underline{R}egistration, denoted as RAR, to predict transformation for\npairwise point sets in the self-supervised learning fashion. More specifically,\nwe develop a novel region-aware decoder (RAD) module that is formed with an\nimplicit neural region representation parameterized by neural networks. The\nimplicit neural region representation is learned with a self-supervised 3D\nshape reconstruction loss without the need for region labels. Consequently, the\nregion-aware decoder (RAD) module guides the training of the region-aware\ntransformation (RAT) module and region-aware weight (RAW) module, which predict\nthe transforms and weights for different regions respectively. The global\ngeometric transformation from source point set to target one is then formed by\nthe weighted fusion of region-aware transforms. Compared to the\nstate-of-the-art approaches, our experiments show that our RAR achieves\nsuperior registration performance over various benchmark datasets (e.g.\nModelNet40).\n","authors":["Yu Hao","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2110.03544v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2006.06200"},{"id":"http://arxiv.org/abs/2307.09915v1","updated":"2023-07-19T11:35:21Z","published":"2023-07-19T11:35:21Z","title":"Embedded Heterogeneous Attention Transformer for Cross-lingual Image\n Captioning","summary":" Cross-lingual image captioning is confronted with both cross-lingual and\ncross-modal challenges for multimedia analysis. The crucial issue in this task\nis to model the global and local matching between the image and different\nlanguages. Existing cross-modal embedding methods based on Transformer\narchitecture oversight the local matching between the image region and\nmonolingual words, not to mention in the face of a variety of differentiated\nlanguages. Due to the heterogeneous property of the cross-modal and\ncross-lingual task, we utilize the heterogeneous network to establish\ncross-domain relationships and the local correspondences between the image and\ndifferent languages. In this paper, we propose an Embedded Heterogeneous\nAttention Transformer (EHAT) to build reasoning paths bridging cross-domain for\ncross-lingual image captioning and integrate into transformer. The proposed\nEHAT consists of a Masked Heterogeneous Cross-attention (MHCA), Heterogeneous\nAttention Reasoning Network (HARN) and Heterogeneous Co-attention (HCA). HARN\nas the core network, models and infers cross-domain relationship anchored by\nvision bounding box representation features to connect two languages word\nfeatures and learn the heterogeneous maps. MHCA and HCA implement cross-domain\nintegration in the encoder through the special heterogeneous attention and\nenable single model to generate two language captioning. 
We test on MSCOCO\ndataset to generate English and Chinese, which are most widely used and have\nobvious difference between their language families. Our experiments show that\nour method even achieve better than advanced monolingual methods.\n","authors":["Zijie Song","Zhenzhen Hu","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2307.09915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03238v3","updated":"2023-07-19T11:20:12Z","published":"2023-05-05T01:40:00Z","title":"Reduction of Class Activation Uncertainty with Background Information","summary":" Multitask learning is a popular approach to training high-performing neural\nnetworks with improved generalization. In this paper, we propose a background\nclass to achieve improved generalization at a lower computation compared to\nmultitask learning to help researchers and organizations with limited\ncomputation power. We also present a methodology for selecting background\nimages and discuss potential future improvements. We apply our approach to\nseveral datasets and achieved improved generalization with much lower\ncomputation. We also investigate class activation mappings (CAMs) of the\ntrained model and observed the tendency towards looking at a bigger picture in\na few class classification problems with the proposed model training\nmethodology. Applying transformer with the proposed background class, we\nreceive state-of-the-art (SOTA) performance on STL-10, Caltech-101, and\nCINIC-10 datasets. Example scripts are available in the `CAM' folder of the\nfollowing GitHub Repository: github.com/dipuk0506/UQ\n","authors":["H M Dipu Kabir"],"pdf_url":"https://arxiv.org/pdf/2305.03238v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09906v1","updated":"2023-07-19T11:10:26Z","published":"2023-07-19T11:10:26Z","title":"Implicit Identity Representation Conditioned Memory Compensation Network\n for Talking Head video Generation","summary":" Talking head video generation aims to animate a human face in a still image\nwith dynamic poses and expressions using motion information derived from a\ntarget-driving video, while maintaining the person's identity in the source\nimage. However, dramatic and complex motions in the driving video cause\nambiguous generation, because the still source image cannot provide sufficient\nappearance information for occluded regions or delicate expression variations,\nwhich produces severe artifacts and significantly degrades the generation\nquality. To tackle this problem, we propose to learn a global facial\nrepresentation space, and design a novel implicit identity representation\nconditioned memory compensation network, coined as MCNet, for high-fidelity\ntalking head generation.~Specifically, we devise a network module to learn a\nunified spatial facial meta-memory bank from all training samples, which can\nprovide rich facial structure and appearance priors to compensate warped source\nfacial features for the generation. Furthermore, we propose an effective query\nmechanism based on implicit identity representations learned from the discrete\nkeypoints of the source image. It can greatly facilitate the retrieval of more\ncorrelated information from the memory bank for the compensation. Extensive\nexperiments demonstrate that MCNet can learn representative and complementary\nfacial memory, and can clearly outperform previous state-of-the-art talking\nhead generation methods on VoxCeleb1 and CelebV datasets. 
Please check our\n\\href{https://github.com/harlanhong/ICCV2023-MCNET}{Project}.\n","authors":["Fa-Ting Hong","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09906v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.09211v3","updated":"2023-07-19T10:52:30Z","published":"2023-05-16T06:40:04Z","title":"CB-HVTNet: A channel-boosted hybrid vision transformer network for\n lymphocyte assessment in histopathological images","summary":" Transformers, due to their ability to learn long range dependencies, have\novercome the shortcomings of convolutional neural networks (CNNs) for global\nperspective learning. Therefore, they have gained the focus of researchers for\nseveral vision related tasks including medical diagnosis. However, their\nmulti-head attention module only captures global level feature representations,\nwhich is insufficient for medical images. To address this issue, we propose a\nChannel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning\nto generate boosted channels and employs both transformers and CNNs to analyse\nlymphocytes in histopathological images. The proposed CB HVT comprises five\nmodules, including a channel generation module, channel exploitation module,\nchannel merging module, region-aware module, and a detection and segmentation\nhead, which work together to effectively identify lymphocytes. The channel\ngeneration module uses the idea of channel boosting through transfer learning\nto extract diverse channels from different auxiliary learners. In the CB HVT,\nthese boosted channels are first concatenated and ranked using an attention\nmechanism in the channel exploitation module. A fusion block is then utilized\nin the channel merging module for a gradual and systematic merging of the\ndiverse boosted channels to improve the network's learning representations. The\nCB HVT also employs a proposal network in its region aware module and a head to\neffectively identify objects, even in overlapping regions and with artifacts.\nWe evaluated the proposed CB HVT on two publicly available datasets for\nlymphocyte assessment in histopathological images. The results show that CB HVT\noutperformed other state of the art detection models, and has good\ngeneralization ability, demonstrating its value as a tool for pathologists.\n","authors":["Momina Liaqat Ali","Zunaira Rauf","Asifullah Khan","Anabia Sohail","Rafi Ullah","Jeonghwan Gwak"],"pdf_url":"https://arxiv.org/pdf/2305.09211v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09893v1","updated":"2023-07-19T10:45:49Z","published":"2023-07-19T10:45:49Z","title":"Learning from Abstract Images: on the Importance of Occlusion in a\n Minimalist Encoding of Human Poses","summary":" Existing 2D-to-3D pose lifting networks suffer from poor performance in\ncross-dataset benchmarks. Although the use of 2D keypoints joined by\n\"stick-figure\" limbs has shown promise as an intermediate step, stick-figures\ndo not account for occlusion information that is often inherent in an image. In\nthis paper, we propose a novel representation using opaque 3D limbs that\npreserves occlusion information while implicitly encoding joint locations.\nCrucially, when training on data with accurate three-dimensional keypoints and\nwithout part-maps, this representation allows training on abstract synthetic\nimages, with occlusion, from as many synthetic viewpoints as desired. 
The\nresult is a pose defined by limb angles rather than joint positions\n$\\unicode{x2013}$ because poses are, in the real world, independent of cameras\n$\\unicode{x2013}$ allowing us to predict poses that are completely independent\nof camera viewpoint. The result provides not only an improvement in\nsame-dataset benchmarks, but a \"quantum leap\" in cross-dataset benchmarks.\n","authors":["Saad Manzur","Wayne Hayes"],"pdf_url":"https://arxiv.org/pdf/2307.09893v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.09892v1","updated":"2023-07-19T10:44:44Z","published":"2023-07-19T10:44:44Z","title":"3Deformer: A Common Framework for Image-Guided Mesh Deformation","summary":" We propose 3Deformer, a general-purpose framework for interactive 3D shape\nediting. Given a source 3D mesh with semantic materials, and a user-specified\nsemantic image, 3Deformer can accurately edit the source mesh following the\nshape guidance of the semantic image, while preserving the source topology as\nrigid as possible. Recent studies of 3D shape editing mostly focus on learning\nneural networks to predict 3D shapes, which requires high-cost 3D training\ndatasets and is limited to handling objects involved in the datasets. Unlike\nthese studies, our 3Deformer is a non-training and common framework, which only\nrequires supervision of readily-available semantic images, and is compatible\nwith editing various objects unlimited by datasets. In 3Deformer, the source\nmesh is deformed utilizing the differentiable renderer technique, according to\nthe correspondences between semantic images and mesh materials. However,\nguiding complex 3D shapes with a simple 2D image incurs extra challenges, that\nis, the deform accuracy, surface smoothness, geometric rigidity, and global\nsynchronization of the edited mesh should be guaranteed. To address these\nchallenges, we propose a hierarchical optimization architecture to balance the\nglobal and local shape features, and propose further various strategies and\nlosses to improve properties of accuracy, smoothness, rigidity, and so on.\nExtensive experiments show that our 3Deformer is able to produce impressive\nresults and reaches the state-of-the-art level.\n","authors":["Hao Su","Xuefeng Liu","Jianwei Niu","Ji Wan","Xinghao Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09886v1","updated":"2023-07-19T10:31:35Z","published":"2023-07-19T10:31:35Z","title":"A reinforcement learning approach for VQA validation: an application to\n diabetic macular edema grading","summary":" Recent advances in machine learning models have greatly increased the\nperformance of automated methods in medical image analysis. However, the\ninternal functioning of such models is largely hidden, which hinders their\nintegration in clinical practice. Explainability and trust are viewed as\nimportant aspects of modern methods, for the latter's widespread use in\nclinical communities. As such, validation of machine learning models represents\nan important aspect and yet, most methods are only validated in a limited way.\nIn this work, we focus on providing a richer and more appropriate validation\napproach for highly powerful Visual Question Answering (VQA) algorithms. To\nbetter understand the performance of these methods, which answer arbitrary\nquestions related to images, this work focuses on an automatic visual Turing\ntest (VTT). 
That is, we propose an automatic adaptive questioning method, that\naims to expose the reasoning behavior of a VQA algorithm. Specifically, we\nintroduce a reinforcement learning (RL) agent that observes the history of\npreviously asked questions, and uses it to select the next question to pose. We\ndemonstrate our approach in the context of evaluating algorithms that\nautomatically answer questions related to diabetic macular edema (DME) grading.\nThe experiments show that such an agent has similar behavior to a clinician,\nwhereby asking questions that are relevant to key clinical concepts.\n","authors":["Tatiana Fountoukidou","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2307.09886v1.pdf","comment":"16 pages (+ 23 pages supplementary material)"},{"id":"http://arxiv.org/abs/2307.09880v1","updated":"2023-07-19T10:23:28Z","published":"2023-07-19T10:23:28Z","title":"A3D: Adaptive, Accurate, and Autonomous Navigation for Edge-Assisted\n Drones","summary":" Accurate navigation is of paramount importance to ensure flight safety and\nefficiency for autonomous drones. Recent research starts to use Deep Neural\nNetworks to enhance drone navigation given their remarkable predictive\ncapability for visual perception. However, existing solutions either run DNN\ninference tasks on drones in situ, impeded by the limited onboard resource, or\noffload the computation to external servers which may incur large network\nlatency. Few works consider jointly optimizing the offloading decisions along\nwith image transmission configurations and adapting them on the fly. In this\npaper, we propose A3D, an edge server assisted drone navigation framework that\ncan dynamically adjust task execution location, input resolution, and image\ncompression ratio in order to achieve low inference latency, high prediction\naccuracy, and long flight distances. Specifically, we first augment\nstate-of-the-art convolutional neural networks for drone navigation and define\na novel metric called Quality of Navigation as our optimization objective which\ncan effectively capture the above goals. We then design a deep reinforcement\nlearning based neural scheduler at the drone side for which an information\nencoder is devised to reshape the state features and thus improve its learning\nability. To further support simultaneous multi-drone serving, we extend the\nedge server design by developing a network-aware resource allocation algorithm,\nwhich allows provisioning containerized resources aligned with drones' demand.\nWe finally implement a proof-of-concept prototype with realistic devices and\nvalidate its performance in a real-world campus scene, as well as a simulation\nenvironment for thorough evaluation upon AirSim. Extensive experimental results\nshow that A3D can reduce end-to-end latency by 28.06% and extend the flight\ndistance by up to 27.28% compared with non-adaptive solutions.\n","authors":["Liekang Zeng","Haowei Chen","Daipeng Feng","Xiaoxi Zhang","Xu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09880v1.pdf","comment":"Accepted by IEEE/ACM Transactions on Networking"},{"id":"http://arxiv.org/abs/2304.06403v2","updated":"2023-07-19T10:12:58Z","published":"2023-04-13T11:10:16Z","title":"Leveraging triplet loss for unsupervised action segmentation","summary":" In this paper, we propose a novel fully unsupervised framework that learns\naction representations suitable for the action segmentation task from the\nsingle input video itself, without requiring any training data. 
Our method is a\ndeep metric learning approach rooted in a shallow network with a triplet loss\noperating on similarity distributions and a novel triplet selection strategy\nthat effectively models temporal and semantic priors to discover actions in the\nnew representational space. Under these circumstances, we successfully recover\ntemporal boundaries in the learned action representations with higher quality\ncompared with existing unsupervised approaches. The proposed method is\nevaluated on two widely used benchmark datasets for the action segmentation\ntask and it achieves competitive performance by applying a generic clustering\nalgorithm on the learned representations.\n","authors":["E. Bueno-Benito","B. Tura","M. Dimiccoli"],"pdf_url":"https://arxiv.org/pdf/2304.06403v2.pdf","comment":"Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) Workshops, 2023, pp. 4921-4929"},{"id":"http://arxiv.org/abs/2304.05417v2","updated":"2023-07-19T10:01:29Z","published":"2023-04-11T18:00:02Z","title":"The MONET dataset: Multimodal drone thermal dataset recorded in rural\n scenarios","summary":" We present MONET, a new multimodal dataset captured using a thermal camera\nmounted on a drone that flew over rural areas, and recorded human and vehicle\nactivities. We captured MONET to study the problem of object localisation and\nbehaviour understanding of targets undergoing large-scale variations and being\nrecorded from different and moving viewpoints. Target activities occur in two\ndifferent land sites, each with unique scene structures and cluttered\nbackgrounds. MONET consists of approximately 53K images featuring 162K manually\nannotated bounding boxes. Each image is timestamp-aligned with drone metadata\nthat includes information about attitudes, speed, altitude, and GPS\ncoordinates. MONET is different from previous thermal drone datasets because it\nfeatures multimodal data, including rural scenes captured with thermal cameras\ncontaining both person and vehicle targets, along with trajectory information\nand metadata. We assessed the difficulty of the dataset in terms of transfer\nlearning between the two sites and evaluated nine object detection algorithms\nto identify the open challenges associated with this type of data. Project\npage: https://github.com/fabiopoiesi/monet_dataset.\n","authors":["Luigi Riz","Andrea Caraffa","Matteo Bortolon","Mohamed Lamine Mekhalfi","Davide Boscaini","André Moura","José Antunes","André Dias","Hugo Silva","Andreas Leonidou","Christos Constantinides","Christos Keleshis","Dante Abate","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2304.05417v2.pdf","comment":"Published in Computer Vision and Pattern Recognition (CVPR) Workshops\n 2023 - 6th Multimodal Learning and Applications Workshop"},{"id":"http://arxiv.org/abs/2307.09861v1","updated":"2023-07-19T09:45:06Z","published":"2023-07-19T09:45:06Z","title":"BSDM: Background Suppression Diffusion Model for Hyperspectral Anomaly\n Detection","summary":" Hyperspectral anomaly detection (HAD) is widely used in Earth observation and\ndeep space exploration. A major challenge for HAD is the complex background of\nthe input hyperspectral images (HSIs), resulting in anomalies confused in the\nbackground. On the other hand, the lack of labeled samples for HSIs leads to\npoor generalization of existing HAD methods. This paper starts the first\nattempt to study a new and generalizable background learning problem without\nlabeled samples. 
We present a novel solution BSDM (background suppression\ndiffusion model) for HAD, which can simultaneously learn latent background\ndistributions and generalize to different datasets for suppressing complex\nbackground. It is featured in three aspects: (1) For the complex background of\nHSIs, we design pseudo background noise and learn the potential background\ndistribution in it with a diffusion model (DM). (2) For the generalizability\nproblem, we apply a statistical offset module so that the BSDM adapts to\ndatasets of different domains without labeling samples. (3) For achieving\nbackground suppression, we innovatively improve the inference process of DM by\nfeeding the original HSIs into the denoising network, which removes the\nbackground as noise. Our work paves a new background suppression way for HAD\nthat can improve HAD performance without the prerequisite of manually labeled\ndata. Assessments and generalization experiments of four HAD methods on several\nreal HSI datasets demonstrate the above three unique properties of the proposed\nmethod. The code is available at https://github.com/majitao-xd/BSDM-HAD.\n","authors":["Jitao Ma","Weiying Xie","Yunsong Li","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2307.09861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09857v1","updated":"2023-07-19T09:36:08Z","published":"2023-07-19T09:36:08Z","title":"Blind Image Quality Assessment Using Multi-Stream Architecture with\n Spatial and Channel Attention","summary":" BIQA (Blind Image Quality Assessment) is an important field of study that\nevaluates images automatically. Although significant progress has been made,\nblind image quality assessment remains a difficult task since images vary in\ncontent and distortions. Most algorithms generate quality without emphasizing\nthe important region of interest. In order to solve this, a multi-stream\nspatial and channel attention-based algorithm is being proposed. This algorithm\ngenerates more accurate predictions with a high correlation to human perceptual\nassessment by combining hybrid features from two different backbones, followed\nby spatial and channel attention to provide high weights to the region of\ninterest. Four legacy image quality assessment datasets are used to validate\nthe effectiveness of our proposed approach. Authentic and synthetic distortion\nimage databases are used to demonstrate the effectiveness of the proposed\nmethod, and we show that it has excellent generalization properties with a\nparticular focus on the perceptual foreground information.\n","authors":["Hassan Khalid","Nisar Ahmed"],"pdf_url":"https://arxiv.org/pdf/2307.09857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02203v3","updated":"2023-07-19T09:34:22Z","published":"2023-07-05T10:54:50Z","title":"Neural Fields for Interactive Visualization of Statistical Dependencies\n in 3D Simulation Ensembles","summary":" We present the first neural network that has learned to compactly represent\nand can efficiently reconstruct the statistical dependencies between the values\nof physical variables at different spatial locations in large 3D simulation\nensembles. Going beyond linear dependencies, we consider mutual information as\na measure of non-linear dependence. We demonstrate learning and reconstruction\nwith a large weather forecast ensemble comprising 1000 members, each storing\nmultiple physical variables at a 250 x 352 x 20 simulation grid. 
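The neural-fields entry above (Farokhmanesh et al.) uses mutual information as its measure of non-linear statistical dependence between values sampled across ensemble members at different grid points. As a point of reference for the kind of compute-intensive estimator the learned representation is meant to replace at runtime, here is a minimal NumPy sketch of a plain histogram-based mutual-information estimate between two such variables; the bin count, variable names, and toy data are illustrative and not taken from the paper.

```python
import numpy as np

def mutual_information(x, y, bins=16):
    """Histogram-based MI estimate between two 1-D samples (in nats)."""
    joint, _, _ = np.histogram2d(x, y, bins=bins)
    pxy = joint / joint.sum()                # joint probability table
    px = pxy.sum(axis=1, keepdims=True)      # marginal of x
    py = pxy.sum(axis=0, keepdims=True)      # marginal of y
    nz = pxy > 0                             # avoid log(0)
    return float(np.sum(pxy[nz] * np.log(pxy[nz] / (px @ py)[nz])))

# Toy "ensemble": 1000 members observing two correlated variables at two grid points.
rng = np.random.default_rng(0)
var_a = rng.normal(size=1000)
var_b = 0.8 * var_a + 0.2 * rng.normal(size=1000)
print(f"MI estimate: {mutual_information(var_a, var_b):.3f} nats")
```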
By\ncircumventing compute-intensive statistical estimators at runtime, we\ndemonstrate significantly reduced memory and computation requirements for\nreconstructing the major dependence structures. This enables embedding the\nestimator into a GPU-accelerated direct volume renderer and interactively\nvisualizing all mutual dependencies for a selected domain point.\n","authors":["Fatemeh Farokhmanesh","Kevin Höhlein","Christoph Neuhauser","Rüdiger Westermann"],"pdf_url":"https://arxiv.org/pdf/2307.02203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09856v1","updated":"2023-07-19T09:30:00Z","published":"2023-07-19T09:30:00Z","title":"Hierarchical Spatio-Temporal Representation Learning for Gait\n Recognition","summary":" Gait recognition is a biometric technique that identifies individuals by\ntheir unique walking styles, which is suitable for unconstrained environments\nand has a wide range of applications. While current methods focus on exploiting\nbody part-based representations, they often neglect the hierarchical\ndependencies between local motion patterns. In this paper, we propose a\nhierarchical spatio-temporal representation learning (HSTL) framework for\nextracting gait features from coarse to fine. Our framework starts with a\nhierarchical clustering analysis to recover multi-level body structures from\nthe whole body to local details. Next, an adaptive region-based motion\nextractor (ARME) is designed to learn region-independent motion features. The\nproposed HSTL then stacks multiple ARMEs in a top-down manner, with each ARME\ncorresponding to a specific partition level of the hierarchy. An adaptive\nspatio-temporal pooling (ASTP) module is used to capture gait features at\ndifferent levels of detail to perform hierarchical feature mapping. Finally, a\nframe-level temporal aggregation (FTA) module is employed to reduce redundant\ninformation in gait sequences through multi-scale temporal downsampling.\nExtensive experiments on CASIA-B, OUMVLP, GREW, and Gait3D datasets demonstrate\nthat our method outperforms the state-of-the-art while maintaining a reasonable\nbalance between model accuracy and complexity.\n","authors":["Lei Wang","Bo Liu","Fangfang Liang","Bincheng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09856v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.00574v2","updated":"2023-07-19T09:27:14Z","published":"2023-07-02T13:57:45Z","title":"Bidirectional Temporal Diffusion Model for Temporally Consistent Human\n Animation","summary":" We introduce a method to generate temporally coherent human animation from a\nsingle image, a video, or a random noise. This problem has been formulated as\nmodeling of an auto-regressive generation, i.e., to regress past frames to\ndecode future frames. However, such unidirectional generation is highly prone\nto motion drifting over time, generating unrealistic human animation with\nsignificant artifacts such as appearance distortion. We claim that\nbidirectional temporal modeling enforces temporal coherence on a generative\nnetwork by largely suppressing the motion ambiguity of human appearance. To\nprove our claim, we design a novel human animation framework using a denoising\ndiffusion model: a neural network learns to generate the image of a person by\ndenoising temporal Gaussian noises whose intermediate results are\ncross-conditioned bidirectionally between consecutive frames. 
In the\nexperiments, our method demonstrates strong performance compared to existing\nunidirectional approaches with realistic temporal coherence\n","authors":["Tserendorj Adiya","Sanghun Kim","Jung Eun Lee","Jae Shin Yoon","Hwasup Lim"],"pdf_url":"https://arxiv.org/pdf/2307.00574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v2","updated":"2023-07-19T09:23:43Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v2.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 12 figures, 13 tables"},{"id":"http://arxiv.org/abs/2208.10741v3","updated":"2023-07-19T09:15:05Z","published":"2022-08-23T05:27:32Z","title":"Hierarchically Decomposed Graph Convolutional Networks for\n Skeleton-Based Action Recognition","summary":" Graph convolutional networks (GCNs) are the most commonly used methods for\nskeleton-based action recognition and have achieved remarkable performance.\nGenerating adjacency matrices with semantically meaningful edges is\nparticularly important for this task, but extracting such edges is challenging\nproblem. To solve this, we propose a hierarchically decomposed graph\nconvolutional network (HD-GCN) architecture with a novel hierarchically\ndecomposed graph (HD-Graph). 
The proposed HD-GCN effectively decomposes every\njoint node into several sets to extract major structurally adjacent and distant\nedges, and uses them to construct an HD-Graph containing those edges in the\nsame semantic spaces of a human skeleton. In addition, we introduce an\nattention-guided hierarchy aggregation (A-HA) module to highlight the dominant\nhierarchical edge sets of the HD-Graph. Furthermore, we apply a new six-way\nensemble method, which uses only joint and bone stream without any motion\nstream. The proposed model is evaluated and achieves state-of-the-art\nperformance on four large, popular datasets. Finally, we demonstrate the\neffectiveness of our model with various comparative experiments.\n","authors":["Jungho Lee","Minhyeok Lee","Dogyoon Lee","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2208.10741v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.06689v2","updated":"2023-07-19T09:09:42Z","published":"2023-07-13T11:21:58Z","title":"YOLIC: An Efficient Method for Object Localization and Classification on\n Edge Devices","summary":" In the realm of Tiny AI, we introduce \"You Only Look at Interested Cells\"\n(YOLIC), an efficient method for object localization and classification on edge\ndevices. Seamlessly blending the strengths of semantic segmentation and object\ndetection, YOLIC offers superior computational efficiency and precision. By\nadopting Cells of Interest for classification instead of individual pixels,\nYOLIC encapsulates relevant information, reduces computational load, and\nenables rough object shape inference. Importantly, the need for bounding box\nregression is obviated, as YOLIC capitalizes on the predetermined cell\nconfiguration that provides information about potential object location, size,\nand shape. To tackle the issue of single-label classification limitations, a\nmulti-label classification approach is applied to each cell, effectively\nrecognizing overlapping or closely situated objects. This paper presents\nextensive experiments on multiple datasets, demonstrating that YOLIC achieves\ndetection performance comparable to the state-of-the-art YOLO algorithms while\nsurpassing in speed, exceeding 30fps on a Raspberry Pi 4B CPU. All resources\nrelated to this study, including datasets, cell designer, image annotation\ntool, and source code, have been made publicly available on our project website\nat https://kai3316.github.io/yolic.github.io\n","authors":["Kai Su","Qiangfu Zhao","Yoichi Tomioka","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2307.06689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09847v1","updated":"2023-07-19T09:09:24Z","published":"2023-07-19T09:09:24Z","title":"Cryo-forum: A framework for orientation recovery with uncertainty\n measure with the application in cryo-EM image analysis","summary":" In single-particle cryo-electron microscopy (cryo-EM), the efficient\ndetermination of orientation parameters for 2D projection images poses a\nsignificant challenge yet is crucial for reconstructing 3D structures. This\ntask is complicated by the high noise levels present in the cryo-EM datasets,\nwhich often include outliers, necessitating several time-consuming 2D clean-up\nprocesses. Recently, solutions based on deep learning have emerged, offering a\nmore streamlined approach to the traditionally laborious task of orientation\nestimation. These solutions often employ amortized inference, eliminating the\nneed to estimate parameters individually for each image. 
However, these methods\nfrequently overlook the presence of outliers and may not adequately concentrate\non the components used within the network. This paper introduces a novel\napproach that uses a 10-dimensional feature vector to represent the orientation\nand applies a Quadratically-Constrained Quadratic Program to derive the\npredicted orientation as a unit quaternion, supplemented by an uncertainty\nmetric. Furthermore, we propose a unique loss function that considers the\npairwise distances between orientations, thereby enhancing the accuracy of our\nmethod. Finally, we also comprehensively evaluate the design choices involved\nin constructing the encoder network, a topic that has not received sufficient\nattention in the literature. Our numerical analysis demonstrates that our\nmethodology effectively recovers orientations from 2D cryo-EM images in an\nend-to-end manner. Importantly, the inclusion of uncertainty quantification\nallows for direct clean-up of the dataset at the 3D level. Lastly, we package\nour proposed methods into a user-friendly software suite named cryo-forum,\ndesigned for easy accessibility by the developers.\n","authors":["Szu-Chi Chung"],"pdf_url":"https://arxiv.org/pdf/2307.09847v1.pdf","comment":"27 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.09841v1","updated":"2023-07-19T08:55:39Z","published":"2023-07-19T08:55:39Z","title":"Compressive Image Scanning Microscope","summary":" We present a novel approach to implement compressive sensing in laser\nscanning microscopes (LSM), specifically in image scanning microscopy (ISM),\nusing a single-photon avalanche diode (SPAD) array detector. Our method\naddresses two significant limitations in applying compressive sensing to LSM:\nthe time to compute the sampling matrix and the quality of reconstructed\nimages. We employ a fixed sampling strategy, skipping alternate rows and\ncolumns during data acquisition, which reduces the number of points scanned by\na factor of four and eliminates the need to compute different sampling\nmatrices. By exploiting the parallel images generated by the SPAD array, we\nimprove the quality of the reconstructed compressive-ISM images compared to\nstandard compressive confocal LSM images. Our results demonstrate the\neffectiveness of our approach in producing higher-quality images with reduced\ndata acquisition time and potential benefits in reducing photobleaching.\n","authors":["Ajay Gunalan","Marco Castello","Simonluca Piazza","Shunlei Li","Alberto Diaspro","Leonardo S. Mattos","Paolo Bianchini"],"pdf_url":"https://arxiv.org/pdf/2307.09841v1.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2111.01396v2","updated":"2023-07-19T08:55:05Z","published":"2021-11-02T06:58:22Z","title":"Boundary Distribution Estimation for Precise Object Detection","summary":" In the field of state-of-the-art object detection, the task of object\nlocalization is typically accomplished through a dedicated subnet that\nemphasizes bounding box regression. This subnet traditionally predicts the\nobject's position by regressing the box's center position and scaling factors.\nDespite the widespread adoption of this approach, we have observed that the\nlocalization results often suffer from defects, leading to unsatisfactory\ndetector performance. In this paper, we address the shortcomings of previous\nmethods through theoretical analysis and experimental verification and present\nan innovative solution for precise object detection. 
Instead of solely focusing\non the object's center and size, our approach enhances the accuracy of bounding\nbox localization by refining the box edges based on the estimated distribution\nat the object's boundary. Experimental results demonstrate the potential and\ngeneralizability of our proposed method.\n","authors":["Peng Zhi","Haoran Zhou","Hang Huang","Rui Zhao","Rui Zhou","Qingguo Zhou"],"pdf_url":"https://arxiv.org/pdf/2111.01396v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09340v2","updated":"2023-07-19T08:55:01Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v2.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2210.06551v3","updated":"2023-07-19T08:54:27Z","published":"2022-10-12T19:46:25Z","title":"MotionBERT: A Unified Perspective on Learning Human Motion\n Representations","summary":" We present a unified perspective on tackling various human-centric video\ntasks by learning human motion representations from large-scale and\nheterogeneous data resources. Specifically, we propose a pretraining stage in\nwhich a motion encoder is trained to recover the underlying 3D motion from\nnoisy partial 2D observations. The motion representations acquired in this way\nincorporate geometric, kinematic, and physical knowledge about human motion,\nwhich can be easily transferred to multiple downstream tasks. 
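The MotionBERT entry above pretrains a motion encoder to recover the underlying 3D motion from noisy, partial 2D observations. The sketch below shows, under stated assumptions, what such a 2D-to-3D recovery objective can look like in PyTorch: a placeholder encoder (standing in for the DSTformer described next), randomly masked and perturbed 2D keypoints as input, and a per-joint position error as the loss. Shapes, masking ratio, and the toy encoder are illustrative, not the paper's.

```python
import torch
import torch.nn as nn

# Placeholder encoder: maps masked/noisy 2D keypoints to 3D joints.
# Shapes are illustrative: (batch, frames, joints, 2) -> (batch, frames, joints, 3).
class ToyMotionEncoder(nn.Module):
    def __init__(self, joints=17, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(joints * 2, hidden), nn.ReLU(),
            nn.Linear(hidden, joints * 3),
        )

    def forward(self, kp2d):
        b, t, j, _ = kp2d.shape
        out = self.net(kp2d.reshape(b, t, j * 2))
        return out.reshape(b, t, j, 3)

encoder = ToyMotionEncoder()
kp3d_gt = torch.randn(8, 16, 17, 3)                                   # ground-truth 3D motion
kp2d = kp3d_gt[..., :2] + 0.02 * torch.randn_like(kp3d_gt[..., :2])   # noisy 2D observations
mask = (torch.rand(8, 16, 17, 1) > 0.15).float()                      # drop ~15% of joints ("partial")
pred3d = encoder(kp2d * mask)
loss = (pred3d - kp3d_gt).norm(dim=-1).mean()                         # per-joint position error
loss.backward()
```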
We implement the\nmotion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer)\nneural network. It could capture long-range spatio-temporal relationships among\nthe skeletal joints comprehensively and adaptively, exemplified by the lowest\n3D pose estimation error so far when trained from scratch. Furthermore, our\nproposed framework achieves state-of-the-art performance on all three\ndownstream tasks by simply finetuning the pretrained motion encoder with a\nsimple regression head (1-2 layers), which demonstrates the versatility of the\nlearned motion representations. Code and models are available at\nhttps://motionbert.github.io/\n","authors":["Wentao Zhu","Xiaoxuan Ma","Zhaoyang Liu","Libin Liu","Wayne Wu","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2210.06551v3.pdf","comment":"ICCV 2023 version"},{"id":"http://arxiv.org/abs/2307.09829v1","updated":"2023-07-19T08:34:25Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little for\nclassification. This study empirically investigates the latter and expands the\nunderstanding of frequency shortcuts. First, we perform experiments on\nsynthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low- or high-frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. We recommend that future research should focus\non effective training schemes mitigating frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v1.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2307.09827v1","updated":"2023-07-19T08:32:59Z","published":"2023-07-19T08:32:59Z","title":"Online Continual Learning for Robust Indoor Object Recognition","summary":" Vision systems mounted on home robots need to interact with unseen classes in\nchanging environments. Robots have limited computational resources, labelled\ndata and storage capability. These requirements pose some unique challenges:\nmodels should adapt without forgetting past knowledge in a data- and\nparameter-efficient way. We characterize the problem as few-shot (FS) online\ncontinual learning (OCL), where robotic agents learn from a non-repeated stream\nof few-shot data updating only a few model parameters. Additionally, such\nmodels experience variable conditions at test time, where objects may appear in\ndifferent poses (e.g., horizontal or vertical) and environments (e.g., day or\nnight). 
To improve robustness of CL agents, we propose RobOCLe, which; 1)\nconstructs an enriched feature space computing high order statistical moments\nfrom the embedded features of samples; and 2) computes similarity between high\norder statistics of the samples on the enriched feature space, and predicts\ntheir class labels. We evaluate robustness of CL models to train/test\naugmentations in various cases. We show that different moments allow RobOCLe to\ncapture different properties of deformations, providing higher robustness with\nno decrease of inference speed.\n","authors":["Umberto Michieli","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.09827v1.pdf","comment":"IROS 2023"},{"id":"http://arxiv.org/abs/2307.09416v2","updated":"2023-07-19T08:27:50Z","published":"2023-07-18T16:33:30Z","title":"Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation\n Evaluation","summary":" Research in Image Generation has recently made significant progress,\nparticularly boosted by the introduction of Vision-Language models which are\nable to produce high-quality visual content based on textual inputs. Despite\nongoing advancements in terms of generation quality and realism, no methodical\nframeworks have been defined yet to quantitatively measure the quality of the\ngenerated content and the adherence with the prompted requests: so far, only\nhuman-based evaluations have been adopted for quality satisfaction and for\ncomparing different generative methods. We introduce a novel automated method\nfor Visual Concept Evaluation (ViCE), i.e. to assess consistency between a\ngenerated/edited image and the corresponding prompt/instructions, with a\nprocess inspired by the human cognitive behaviour. ViCE combines the strengths\nof Large Language Models (LLMs) and Visual Question Answering (VQA) into a\nunified pipeline, aiming to replicate the human cognitive process in quality\nassessment. This method outlines visual concepts, formulates image-specific\nverification questions, utilizes the Q&A system to investigate the image, and\nscores the combined outcome. Although this brave new hypothesis of mimicking\nhumans in the image evaluation process is in its preliminary assessment stage,\nresults are promising and open the door to a new form of automatic evaluation\nwhich could have significant impact as the image generation or the image target\nediting tasks become more and more sophisticated.\n","authors":["Federico Betti","Jacopo Staiano","Lorenzo Baraldi","Lorenzo Baraldi","Rita Cucchiara","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2307.09416v2.pdf","comment":"Accepted as oral at ACM MultiMedia 2023 (Brave New Ideas track)"},{"id":"http://arxiv.org/abs/2205.11397v5","updated":"2023-07-19T08:25:37Z","published":"2022-05-23T15:42:12Z","title":"Super Vision Transformer","summary":" We attempt to reduce the computational costs in vision transformers (ViTs),\nwhich increase quadratically in the token number. We present a novel training\nparadigm that trains only one ViT model at a time, but is capable of providing\nimproved image recognition performance with various computational costs. Here,\nthe trained ViT model, termed super vision transformer (SuperViT), is empowered\nwith the versatile ability to solve incoming patches of multiple sizes as well\nas preserve informative tokens with multiple keeping rates (the ratio of\nkeeping tokens) to achieve good hardware efficiency for inference, given that\nthe available hardware resources often change from time to time. 
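The SuperViT entry above trains a single ViT that can operate at several token keeping rates. As a hedged illustration of the general token-keeping idea only (not the paper's actual selection rule), the sketch below scores patch tokens by the class token's attention and retains the top fraction of them:

```python
import torch

def keep_informative_tokens(tokens, cls_attention, keep_rate=0.5):
    """Keep the class token plus the top-`keep_rate` fraction of patch tokens.

    tokens:        (batch, 1 + num_patches, dim), class token first
    cls_attention: (batch, num_patches) attention from the class token to each patch
    """
    b, n, d = tokens.shape
    num_keep = max(1, int((n - 1) * keep_rate))
    idx = cls_attention.topk(num_keep, dim=1).indices      # (batch, num_keep)
    idx = idx.unsqueeze(-1).expand(-1, -1, d)              # broadcast over feature dim
    kept_patches = torch.gather(tokens[:, 1:], dim=1, index=idx)
    return torch.cat([tokens[:, :1], kept_patches], dim=1)

tokens = torch.randn(2, 197, 384)          # e.g. ViT-S: 196 patch tokens + 1 class token
cls_attn = torch.rand(2, 196)
print(keep_informative_tokens(tokens, cls_attn, keep_rate=0.5).shape)  # torch.Size([2, 99, 384])
```

Varying `keep_rate` at inference time is what lets a single model trade accuracy for compute as hardware budgets change.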
Experimental\nresults on ImageNet demonstrate that our SuperViT can considerably reduce the\ncomputational costs of ViT models with even performance increase. For example,\nwe reduce 2x FLOPs of DeiT-S while increasing the Top-1 accuracy by 0.2% and\n0.7% for 1.5x reduction. Also, our SuperViT significantly outperforms existing\nstudies on efficient vision transformers. For example, when consuming the same\namount of FLOPs, our SuperViT surpasses the recent state-of-the-art (SOTA) EViT\nby 1.1% when using DeiT-S as their backbones. The project of this work is made\npublicly available at https://github.com/lmbxmu/SuperViT.\n","authors":["Mingbao Lin","Mengzhao Chen","Yuxin Zhang","Chunhua Shen","Rongrong Ji","Liujuan Cao"],"pdf_url":"https://arxiv.org/pdf/2205.11397v5.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV) in the\n year of 2023"},{"id":"http://arxiv.org/abs/2307.09823v1","updated":"2023-07-19T08:21:01Z","published":"2023-07-19T08:21:01Z","title":"Multi-modal Learning based Prediction for Disease","summary":" Non alcoholic fatty liver disease (NAFLD) is the most common cause of chronic\nliver disease, which can be predicted accurately to prevent advanced fibrosis\nand cirrhosis. While, a liver biopsy, the gold standard for NAFLD diagnosis, is\ninvasive, expensive, and prone to sampling errors. Therefore, non-invasive\nstudies are extremely promising, yet they are still in their infancy due to the\nlack of comprehensive research data and intelligent methods for multi-modal\ndata. This paper proposes a NAFLD diagnosis system (DeepFLDDiag) combining a\ncomprehensive clinical dataset (FLDData) and a multi-modal learning based NAFLD\nprediction method (DeepFLD). The dataset includes over 6000 participants\nphysical examinations, laboratory and imaging studies, extensive\nquestionnaires, and facial images of partial participants, which is\ncomprehensive and valuable for clinical studies. From the dataset, we\nquantitatively analyze and select clinical metadata that most contribute to\nNAFLD prediction. Furthermore, the proposed DeepFLD, a deep neural network\nmodel designed to predict NAFLD using multi-modal input, including metadata and\nfacial images, outperforms the approach that only uses metadata. Satisfactory\nperformance is also verified on other unseen datasets. Inspiringly, DeepFLD can\nachieve competitive results using only facial images as input rather than\nmetadata, paving the way for a more robust and simpler non-invasive NAFLD\ndiagnosis.\n","authors":["Yaran Chen","Xueyu Chen","Yu Han","Haoran Li","Dongbin Zhao","Jingzhong Li","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08096v2","updated":"2023-07-19T08:19:58Z","published":"2023-03-14T17:33:39Z","title":"MELON: NeRF with Unposed Images in SO(3)","summary":" Neural radiance fields enable novel-view synthesis and scene reconstruction\nwith photorealistic quality from a few images, but require known and accurate\ncamera poses. Conventional pose estimation algorithms fail on smooth or\nself-similar scenes, while methods performing inverse rendering from unposed\nviews require a rough initialization of the camera orientations. The main\ndifficulty of pose estimation lies in real-life objects being almost invariant\nunder certain transformations, making the photometric distance between rendered\nviews non-convex with respect to the camera parameters. 
Using an equivalence\nrelation that matches the distribution of local minima in camera space, we\nreduce this space to its quotient set, in which pose estimation becomes a more\nconvex problem. Using a neural-network to regularize pose estimation, we\ndemonstrate that our method - MELON - can reconstruct a neural radiance field\nfrom unposed images with state-of-the-art accuracy while requiring ten times\nfewer views than adversarial approaches.\n","authors":["Axel Levy","Mark Matthews","Matan Sela","Gordon Wetzstein","Dmitry Lagun"],"pdf_url":"https://arxiv.org/pdf/2303.08096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09822v1","updated":"2023-07-19T08:19:08Z","published":"2023-07-19T08:19:08Z","title":"A Siamese-based Verification System for Open-set Architecture\n Attribution of Synthetic Images","summary":" Despite the wide variety of methods developed for synthetic image\nattribution, most of them can only attribute images generated by models or\narchitectures included in the training set and do not work with unknown\narchitectures, hindering their applicability in real-world scenarios. In this\npaper, we propose a verification framework that relies on a Siamese Network to\naddress the problem of open-set attribution of synthetic images to the\narchitecture that generated them. We consider two different settings. In the\nfirst setting, the system determines whether two images have been produced by\nthe same generative architecture or not. In the second setting, the system\nverifies a claim about the architecture used to generate a synthetic image,\nutilizing one or multiple reference images generated by the claimed\narchitecture. The main strength of the proposed system is its ability to\noperate in both closed and open-set scenarios so that the input images, either\nthe query and reference images, can belong to the architectures considered\nduring training or not. Experimental evaluations encompassing various\ngenerative architectures such as GANs, diffusion models, and transformers,\nfocusing on synthetic face image generation, confirm the excellent performance\nof our method in both closed and open-set settings, as well as its strong\ngeneralization capabilities.\n","authors":["Lydia Abady","Jun Wang","Benedetta Tondi","Mauro Barni"],"pdf_url":"https://arxiv.org/pdf/2307.09822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09821v1","updated":"2023-07-19T08:16:34Z","published":"2023-07-19T08:16:34Z","title":"Hierarchical Semantic Perceptual Listener Head Video Generation: A\n High-performance Pipeline","summary":" In dyadic speaker-listener interactions, the listener's head reactions along\nwith the speaker's head movements, constitute an important non-verbal semantic\nexpression together. The listener Head generation task aims to synthesize\nresponsive listener's head videos based on audios of the speaker and reference\nimages of the listener. Compared to the Talking-head generation, it is more\nchallenging to capture the correlation clues from the speaker's audio and\nvisual information. Following the ViCo baseline scheme, we propose a\nhigh-performance solution by enhancing the hierarchical semantic extraction\ncapability of the audio encoder module and improving the decoder part, renderer\nand post-processing modules. Our solution gets the first place on the official\nleaderboard for the track of listening head generation. 
This paper is a\ntechnical report of ViCo@2023 Conversational Head Generation Challenge in ACM\nMultimedia 2023 conference.\n","authors":["Zhigang Chang","Weitai Hu","Qing Yang","Shibao Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.09821v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.09818v1","updated":"2023-07-19T08:06:37Z","published":"2023-07-19T08:06:37Z","title":"Deep unrolling Shrinkage Network for Dynamic MR imaging","summary":" Deep unrolling networks that utilize sparsity priors have achieved great\nsuccess in dynamic magnetic resonance (MR) imaging. The convolutional neural\nnetwork (CNN) is usually utilized to extract the transformed domain, and then\nthe soft thresholding (ST) operator is applied to the CNN-transformed data to\nenforce the sparsity priors. However, the ST operator is usually constrained to\nbe the same across all channels of the CNN-transformed data. In this paper, we\npropose a novel operator, called soft thresholding with channel attention\n(AST), that learns the threshold for each channel. In particular, we put\nforward a novel deep unrolling shrinkage network (DUS-Net) by unrolling the\nalternating direction method of multipliers (ADMM) for optimizing the\ntransformed $l_1$ norm dynamic MR reconstruction model. Experimental results on\nan open-access dynamic cine MR dataset demonstrate that the proposed DUS-Net\noutperforms the state-of-the-art methods. The source code is available at\n\\url{https://github.com/yhao-z/DUS-Net}.\n","authors":["Yinghao Zhang","Xiaodi Li","Weihang Li","Yue Hu"],"pdf_url":"https://arxiv.org/pdf/2307.09818v1.pdf","comment":"5 pages,3 figures,2 tables"},{"id":"http://arxiv.org/abs/2307.07813v3","updated":"2023-07-19T08:06:34Z","published":"2023-07-15T14:34:25Z","title":"TinyTracker: Ultra-Fast and Ultra-Low-Power Edge Vision In-Sensor for\n Gaze Estimation","summary":" Intelligent edge vision tasks encounter the critical challenge of ensuring\npower and latency efficiency due to the typically heavy computational load they\nimpose on edge platforms.This work leverages one of the first \"AI in sensor\"\nvision platforms, IMX500 by Sony, to achieve ultra-fast and ultra-low-power\nend-to-end edge vision applications. We evaluate the IMX500 and compare it to\nother edge platforms, such as the Google Coral Dev Micro and Sony Spresense, by\nexploring gaze estimation as a case study. We propose TinyTracker, a highly\nefficient, fully quantized model for 2D gaze estimation designed to maximize\nthe performance of the edge vision systems considered in this study.\nTinyTracker achieves a 41x size reduction (600Kb) compared to iTracker [1]\nwithout significant loss in gaze estimation accuracy (maximum of 0.16 cm when\nfully quantized). TinyTracker's deployment on the Sony IMX500 vision sensor\nresults in end-to-end latency of around 19ms. The camera takes around 17.9ms to\nread, process and transmit the pixels to the accelerator. The inference time of\nthe network is 0.86ms with an additional 0.24 ms for retrieving the results\nfrom the sensor. The overall energy consumption of the end-to-end system is 4.9\nmJ, including 0.06 mJ for inference. 
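Referring back to the DUS-Net entry above, which replaces the usual shared soft-thresholding operator with one whose threshold is learned per channel: the following is a minimal PyTorch sketch of per-channel soft thresholding driven by a small squeeze-and-excitation-style attention branch. It is a generic reconstruction of that idea, not the authors' exact AST module, and the layer sizes are illustrative.

```python
import torch
import torch.nn as nn

class ChannelwiseSoftThreshold(nn.Module):
    """Soft thresholding y = sign(x) * max(|x| - tau_c, 0) with a learned tau per channel."""

    def __init__(self, channels, reduction=4):
        super().__init__()
        self.attn = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels), nn.Sigmoid(),
        )

    def forward(self, x):                      # x: (batch, channels, H, W)
        scale = x.abs().mean(dim=(2, 3))       # global average of |x| per channel
        tau = scale * self.attn(scale)         # per-channel threshold in (0, scale)
        tau = tau.unsqueeze(-1).unsqueeze(-1)  # broadcast over spatial dimensions
        return torch.sign(x) * torch.clamp(x.abs() - tau, min=0.0)

x = torch.randn(2, 16, 32, 32)
print(ChannelwiseSoftThreshold(16)(x).shape)   # torch.Size([2, 16, 32, 32])
```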
The end-to-end study shows that IMX500 is\n1.7x faster than CoralMicro (19ms vs 34.4ms) and 7x more power efficient (4.9mJ\nVS 34.2mJ)\n","authors":["Pietro Bonazzi","Thomas Ruegg","Sizhen Bian","Yawei Li","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2307.07813v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09815v1","updated":"2023-07-19T08:03:53Z","published":"2023-07-19T08:03:53Z","title":"LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network","summary":" Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent\nblur is a challenging task.~Existing blur map-based deblurring methods have\ndemonstrated promising results. In this paper, we propose, to the best of our\nknowledge, the first framework to introduce the contrastive language-image\npre-training framework (CLIP) to achieve accurate blur map estimation from DP\npairs unsupervisedly. To this end, we first carefully design text prompts to\nenable CLIP to understand blur-related geometric prior knowledge from the DP\npair. Then, we propose a format to input stereo DP pair to the CLIP without any\nfine-tuning, where the CLIP is pre-trained on monocular images. Given the\nestimated blur map, we introduce a blur-prior attention block, a blur-weighting\nloss and a blur-aware loss to recover the all-in-focus image. Our method\nachieves state-of-the-art performance in extensive experiments.\n","authors":["Hao Yang","Liyuan Pan","Yan Yang","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09810v1","updated":"2023-07-19T07:58:21Z","published":"2023-07-19T07:58:21Z","title":"GenKL: An Iterative Framework for Resolving Label Ambiguity and Label\n Non-conformity in Web Images Via a New Generalized KL Divergence","summary":" Web image datasets curated online inherently contain ambiguous\nin-distribution (ID) instances and out-of-distribution (OOD) instances, which\nwe collectively call non-conforming (NC) instances. In many recent approaches\nfor mitigating the negative effects of NC instances, the core implicit\nassumption is that the NC instances can be found via entropy maximization. For\n\"entropy\" to be well-defined, we are interpreting the output prediction vector\nof an instance as the parameter vector of a multinomial random variable, with\nrespect to some trained model with a softmax output layer. Hence, entropy\nmaximization is based on the idealized assumption that NC instances have\npredictions that are \"almost\" uniformly distributed. However, in real-world web\nimage datasets, there are numerous NC instances whose predictions are far from\nbeing uniformly distributed. To tackle the limitation of entropy maximization,\nwe propose $(\\alpha, \\beta)$-generalized KL divergence,\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$, which can be used to identify\nsignificantly more NC instances. Theoretical properties of\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$ are proven, and we also show\nempirically that a simple use of $\\mathcal{D}_{\\text{KL}}^{\\alpha,\n\\beta}(p\\|q)$ outperforms all baselines on the NC instance identification task.\nBuilding upon $(\\alpha,\\beta)$-generalized KL divergence, we also introduce a\nnew iterative training framework, GenKL, that identifies and relabels NC\ninstances. 
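The GenKL entry above argues that the common recipe for flagging non-conforming (NC) web images, namely interpreting the softmax output as a multinomial parameter vector and flagging high-entropy, near-uniform predictions, misses NC instances whose predictions are far from uniform. For context, here is a minimal NumPy sketch of that entropy-maximization baseline; the proposed $(\alpha, \beta)$-generalized KL divergence itself is not reproduced here, and the threshold choice is illustrative.

```python
import numpy as np

def prediction_entropy(probs, eps=1e-12):
    """Shannon entropy of each softmax prediction vector (rows of `probs`)."""
    p = np.clip(probs, eps, 1.0)
    return -np.sum(p * np.log(p), axis=1)

# Three toy predictions over 4 classes.
preds = np.array([
    [0.97, 0.01, 0.01, 0.01],   # confident, ID-looking prediction
    [0.25, 0.25, 0.25, 0.25],   # near-uniform: flagged by entropy maximization
    [0.55, 0.43, 0.01, 0.01],   # ambiguous NC-style prediction that entropy may miss
])
scores = prediction_entropy(preds)
threshold = 0.9 * np.log(preds.shape[1])    # e.g. 90% of the maximum possible entropy
print(scores, scores > threshold)           # only the uniform row is flagged
```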
When evaluated on three web image datasets, Clothing1M,\nFood101/Food101N, and mini WebVision 1.0, we achieved new state-of-the-art\nclassification accuracies: $81.34\\%$, $85.73\\%$ and $78.99\\%$/$92.54\\%$\n(top-1/top-5), respectively.\n","authors":["Xia Huang","Kai Fong Ernest Chong"],"pdf_url":"https://arxiv.org/pdf/2307.09810v1.pdf","comment":"Published (with open access) at International Journal of Computer\n Vision (IJCV, 2023). 25 pages, 8 figures. Code is available at:\n https://github.com/codetopaper/GenKL"},{"id":"http://arxiv.org/abs/2307.09804v1","updated":"2023-07-19T07:47:23Z","published":"2023-07-19T07:47:23Z","title":"Fix your downsampling ASAP! Be natively more robust via Aliasing and\n Spectral Artifact free Pooling","summary":" Convolutional neural networks encode images through a sequence of\nconvolutions, normalizations and non-linearities as well as downsampling\noperations into potentially strong semantic embeddings. Yet, previous work\nshowed that even slight mistakes during sampling, leading to aliasing, can be\ndirectly attributed to the networks' lack in robustness. To address such issues\nand facilitate simpler and faster adversarial training, [12] recently proposed\nFLC pooling, a method for provably alias-free downsampling - in theory. In this\nwork, we conduct a further analysis through the lens of signal processing and\nfind that such current pooling methods, which address aliasing in the frequency\ndomain, are still prone to spectral leakage artifacts. Hence, we propose\naliasing and spectral artifact-free pooling, short ASAP. While only introducing\na few modifications to FLC pooling, networks using ASAP as downsampling method\nexhibit higher native robustness against common corruptions, a property that\nFLC pooling was missing. ASAP also increases native robustness against\nadversarial attacks on high and low resolution data while maintaining similar\nclean accuracy or even outperforming the baseline.\n","authors":["Julia Grabinski","Janis Keuper","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.09804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08723v2","updated":"2023-07-19T07:35:54Z","published":"2023-07-17T11:19:41Z","title":"Revisiting Scene Text Recognition: A Data Perspective","summary":" This paper aims to re-assess scene text recognition (STR) from a\ndata-oriented perspective. We begin by revisiting the six commonly used\nbenchmarks in STR and observe a trend of performance saturation, whereby only\n2.91% of the benchmark images cannot be accurately recognized by an ensemble of\n13 representative models. While these results are impressive and suggest that\nSTR could be considered solved, however, we argue that this is primarily due to\nthe less challenging nature of the common benchmarks, thus concealing the\nunderlying issues that STR faces. To this end, we consolidate a large-scale\nreal STR dataset, namely Union14M, which comprises 4 million labeled images and\n10 million unlabeled images, to assess the performance of STR models in more\ncomplex real-world scenarios. Our experiments demonstrate that the 13 models\ncan only achieve an average accuracy of 66.53% on the 4 million labeled images,\nindicating that STR still faces numerous challenges in the real world. By\nanalyzing the error patterns of the 13 models, we identify seven open\nchallenges in STR and develop a challenge-driven benchmark consisting of eight\ndistinct subsets to facilitate further progress in the field. 
Our exploration\ndemonstrates that STR is far from being solved and leveraging data may be a\npromising solution. In this regard, we find that utilizing the 10 million\nunlabeled images through self-supervised pre-training can significantly improve\nthe robustness of STR model in real-world scenarios and leads to\nstate-of-the-art performance.\n","authors":["Qing Jiang","Jiapeng Wang","Dezhi Peng","Chongyu Liu","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2307.08723v2.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2210.16117v4","updated":"2023-07-19T07:34:37Z","published":"2022-10-28T13:25:59Z","title":"Improving the Transferability of Adversarial Attacks on Face Recognition\n with Beneficial Perturbation Feature Augmentation","summary":" Face recognition (FR) models can be easily fooled by adversarial examples,\nwhich are crafted by adding imperceptible perturbations on benign face images.\nThe existence of adversarial face examples poses a great threat to the security\nof society. In order to build a more sustainable digital nation, in this paper,\nwe improve the transferability of adversarial face examples to expose more\nblind spots of existing FR models. Though generating hard samples has shown its\neffectiveness in improving the generalization of models in training tasks, the\neffectiveness of utilizing this idea to improve the transferability of\nadversarial face examples remains unexplored. To this end, based on the\nproperty of hard samples and the symmetry between training tasks and\nadversarial attack tasks, we propose the concept of hard models, which have\nsimilar effects as hard samples for adversarial attack tasks. Utilizing the\nconcept of hard models, we propose a novel attack method called Beneficial\nPerturbation Feature Augmentation Attack (BPFA), which reduces the overfitting\nof adversarial examples to surrogate FR models by constantly generating new\nhard models to craft the adversarial examples. Specifically, in the\nbackpropagation, BPFA records the gradients on pre-selected feature maps and\nuses the gradient on the input image to craft the adversarial example. In the\nnext forward propagation, BPFA leverages the recorded gradients to add\nbeneficial perturbations on their corresponding feature maps to increase the\nloss. Extensive experiments demonstrate that BPFA can significantly boost the\ntransferability of adversarial attacks on FR.\n","authors":["Fengfan Zhou","Hefei Ling","Yuxuan Shi","Jiazhong Chen","Zongyi Li","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2210.16117v4.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2302.05086v3","updated":"2023-07-19T07:31:35Z","published":"2023-02-10T07:08:13Z","title":"Making Substitute Models More Bayesian Can Enhance Transferability of\n Adversarial Examples","summary":" The transferability of adversarial examples across deep neural networks\n(DNNs) is the crux of many black-box attacks. Many prior efforts have been\ndevoted to improving the transferability via increasing the diversity in inputs\nof some substitute models. 
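The entry above contrasts its approach with prior work that improves transferability by diversifying the inputs fed to substitute models. As background only, the sketch below shows one common form of input diversity, a random resize-and-pad applied before the gradient step of an FGSM-style attack; it is not the Bayesian-substitute method this paper proposes, and the function names and hyperparameters are illustrative.

```python
import torch
import torch.nn.functional as F

def diverse_input(x, low=0.9, prob=0.7):
    """Randomly resize and zero-pad an image batch (a common 'input diversity' transform)."""
    if torch.rand(1).item() > prob:
        return x
    b, c, h, w = x.shape
    ratio = low + (1 - low) * torch.rand(1).item()
    new_h, new_w = int(h * ratio), int(w * ratio)
    resized = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=False)
    pad_top = torch.randint(0, h - new_h + 1, (1,)).item()
    pad_left = torch.randint(0, w - new_w + 1, (1,)).item()
    return F.pad(resized, (pad_left, w - new_w - pad_left, pad_top, h - new_h - pad_top))

def fgsm_step(model, x, y, eps=2.0 / 255):
    """One FGSM-style step on a diversified input; `model` is any differentiable classifier."""
    x_adv = x.clone().requires_grad_(True)
    loss = F.cross_entropy(model(diverse_input(x_adv)), y)
    grad, = torch.autograd.grad(loss, x_adv)
    return (x + eps * grad.sign()).clamp(0, 1)
```

Usage would be along the lines of `x_adv = fgsm_step(surrogate, images, labels)` for any PyTorch classifier `surrogate`.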
In this paper, by contrast, we opt for the diversity\nin substitute models and advocate to attack a Bayesian model for achieving\ndesirable transferability. Deriving from the Bayesian formulation, we develop a\nprincipled strategy for possible finetuning, which can be combined with many\noff-the-shelf Gaussian posterior approximations over DNN parameters. Extensive\nexperiments have been conducted to verify the effectiveness of our method, on\ncommon benchmark datasets, and the results demonstrate that our method\noutperforms recent state-of-the-arts by large margins (roughly 19% absolute\nincrease in average attack success rate on ImageNet), and, by combining with\nthese recent methods, further performance gain can be obtained. Our code:\nhttps://github.com/qizhangli/MoreBayesian-attack.\n","authors":["Qizhang Li","Yiwen Guo","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2302.05086v3.pdf","comment":"Accepted by ICLR 2023, fix typos"},{"id":"http://arxiv.org/abs/2307.09795v1","updated":"2023-07-19T07:29:14Z","published":"2023-07-19T07:29:14Z","title":"From West to East: Who can understand the music of the others better?","summary":" Recent developments in MIR have led to several benchmark deep learning models\nwhose embeddings can be used for a variety of downstream tasks. At the same\ntime, the vast majority of these models have been trained on Western pop/rock\nmusic and related styles. This leads to research questions on whether these\nmodels can be used to learn representations for different music cultures and\nstyles, or whether we can build similar music audio embedding models trained on\ndata from different cultures or styles. To that end, we leverage transfer\nlearning methods to derive insights about the similarities between the\ndifferent music cultures to which the data belongs to. We use two Western music\ndatasets, two traditional/folk datasets coming from eastern Mediterranean\ncultures, and two datasets belonging to Indian art music. Three deep audio\nembedding models are trained and transferred across domains, including two\nCNN-based and a Transformer-based architecture, to perform auto-tagging for\neach target domain dataset. Experimental results show that competitive\nperformance is achieved in all domains via transfer learning, while the best\nsource dataset varies for each music culture. The implementation and the\ntrained models are both provided in a public repository.\n","authors":["Charilaos Papaioannou","Emmanouil Benetos","Alexandros Potamianos"],"pdf_url":"https://arxiv.org/pdf/2307.09795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09794v1","updated":"2023-07-19T07:25:33Z","published":"2023-07-19T07:25:33Z","title":"DiffDP: Radiotherapy Dose Prediction via a Diffusion Model","summary":" Currently, deep learning (DL) has achieved the automatic prediction of dose\ndistribution in radiotherapy planning, enhancing its efficiency and quality.\nHowever, existing methods suffer from the over-smoothing problem for their\ncommonly used L_1 or L_2 loss with posterior average calculations. To alleviate\nthis limitation, we innovatively introduce a diffusion-based dose prediction\n(DiffDP) model for predicting the radiotherapy dose distribution of cancer\npatients. Specifically, the DiffDP model contains a forward process and a\nreverse process. In the forward process, DiffDP gradually transforms dose\ndistribution maps into Gaussian noise by adding small noise and trains a noise\npredictor to predict the noise added in each timestep. 
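The DiffDP entry above follows the standard denoising-diffusion recipe in its forward process: corrupt the dose distribution map with Gaussian noise according to a timestep schedule and train a network to predict the injected noise. Below is a minimal PyTorch sketch of that training step with a placeholder convolutional noise predictor; the timestep embedding and the anatomy-conditioned structure encoder described next are omitted, and the schedule values are generic DDPM defaults rather than the paper's.

```python
import torch
import torch.nn as nn

T = 1000
betas = torch.linspace(1e-4, 0.02, T)               # linear noise schedule
alpha_bar = torch.cumprod(1.0 - betas, dim=0)       # cumulative product of (1 - beta_t)

noise_predictor = nn.Sequential(                    # placeholder for the real U-Net
    nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 1, 3, padding=1),
)

def diffusion_training_step(dose_maps):
    """One DDPM-style step: add noise at a random timestep, then predict that noise."""
    b = dose_maps.shape[0]
    t = torch.randint(0, T, (b,))
    noise = torch.randn_like(dose_maps)
    a = alpha_bar[t].view(b, 1, 1, 1)
    noisy = a.sqrt() * dose_maps + (1 - a).sqrt() * noise   # forward (noising) process
    pred_noise = noise_predictor(noisy)                     # timestep conditioning omitted
    return ((pred_noise - noise) ** 2).mean()

loss = diffusion_training_step(torch.rand(4, 1, 64, 64))    # toy 64x64 dose maps
loss.backward()
```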
In the reverse process,\nit removes the noise from the original Gaussian noise in multiple steps with\nthe well-trained noise predictor and finally outputs the predicted dose\ndistribution map. To ensure the accuracy of the prediction, we further design a\nstructure encoder to extract anatomical information from patient anatomy images\nand enable the noise predictor to be aware of the dose constraints within\nseveral essential organs, i.e., the planning target volume and organs at risk.\nExtensive experiments on an in-house dataset with 130 rectum cancer patients\ndemonstrate the s\n","authors":["Zhenghao Feng","Lu Wen","Peng Wang","Binyu Yan","Xi Wu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09794v1.pdf","comment":"to be published in MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.08015v2","updated":"2023-07-19T07:18:12Z","published":"2023-07-16T11:52:27Z","title":"Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via\n Geometry-Guided Cross-View Transformer","summary":" Image retrieval-based cross-view localization methods often lead to very\ncoarse camera pose estimation, due to the limited sampling density of the\ndatabase satellite images. In this paper, we propose a method to increase the\naccuracy of a ground camera's location and orientation by estimating the\nrelative rotation and translation between the ground-level image and its\nmatched/retrieved satellite image. Our approach designs a geometry-guided\ncross-view transformer that combines the benefits of conventional geometry and\nlearnable cross-view transformers to map the ground-view observations to an\noverhead view. Given the synthesized overhead view and observed satellite\nfeature maps, we construct a neural pose optimizer with strong global\ninformation embedding ability to estimate the relative rotation between them.\nAfter aligning their rotations, we develop an uncertainty-guided spatial\ncorrelation to generate a probability map of the vehicle locations, from which\nthe relative translation can be determined. Experimental results demonstrate\nthat our method significantly outperforms the state-of-the-art. Notably, the\nlikelihood of restricting the vehicle lateral pose to be within 1m of its\nGround Truth (GT) value on the cross-view KITTI dataset has been improved from\n$35.54\\%$ to $76.44\\%$, and the likelihood of restricting the vehicle\norientation to be within $1^{\\circ}$ of its GT value has been improved from\n$19.64\\%$ to $99.10\\%$.\n","authors":["Yujiao Shi","Fei Wu","Ankit Vora","Akhil Perincherry","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2307.08015v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09788v1","updated":"2023-07-19T07:11:45Z","published":"2023-07-19T07:11:45Z","title":"Density-invariant Features for Distant Point Cloud Registration","summary":" Registration of distant outdoor LiDAR point clouds is crucial to extending\nthe 3D vision of collaborative autonomous vehicles, and yet is challenging due\nto small overlapping area and a huge disparity between observed point\ndensities. In this paper, we propose Group-wise Contrastive Learning (GCL)\nscheme to extract density-invariant geometric features to register distant\noutdoor LiDAR point clouds. We mark through theoretical analysis and\nexperiments that, contrastive positives should be independent and identically\ndistributed (i.i.d.), in order to train densityinvariant feature extractors. 
We\npropose upon the conclusion a simple yet effective training scheme to force the\nfeature of multiple point clouds in the same spatial location (referred to as\npositive groups) to be similar, which naturally avoids the sampling bias\nintroduced by a pair of point clouds to conform with the i.i.d. principle. The\nresulting fully-convolutional feature extractor is more powerful and\ndensity-invariant than state-of-the-art methods, improving the registration\nrecall of distant scenarios on KITTI and nuScenes benchmarks by 40.9% and\n26.9%, respectively. The code will be open-sourced.\n","authors":["Quan Liu","Hongzi Zhu","Yunsong Zhou","Hongyang Li","Shan Chang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2307.09788v1.pdf","comment":"In Proceedings of the IEEE/CVF International Conference on Computer\n Vision (ICCV), 2023"},{"id":"http://arxiv.org/abs/2307.09787v1","updated":"2023-07-19T07:11:11Z","published":"2023-07-19T07:11:11Z","title":"DVPT: Dynamic Visual Prompt Tuning of Large Pre-trained Models for\n Medical Image Analysis","summary":" Limited labeled data makes it hard to train models from scratch in medical\ndomain, and an important paradigm is pre-training and then fine-tuning. Large\npre-trained models contain rich representations, which can be adapted to\ndownstream medical tasks. However, existing methods either tune all the\nparameters or the task-specific layers of the pre-trained models, ignoring the\ninput variations of medical images, and thus they are not efficient or\neffective. In this work, we aim to study parameter-efficient fine-tuning (PEFT)\nfor medical image analysis, and propose a dynamic visual prompt tuning method,\nnamed DVPT. It can extract knowledge beneficial to downstream tasks from large\nmodels with a few trainable parameters. Firstly, the frozen features are\ntransformed by an lightweight bottleneck layer to learn the domain-specific\ndistribution of downstream medical tasks, and then a few learnable visual\nprompts are used as dynamic queries and then conduct cross-attention with the\ntransformed features, attempting to acquire sample-specific knowledge that are\nsuitable for each sample. Finally, the features are projected to original\nfeature dimension and aggregated with the frozen features. This DVPT module can\nbe shared between different Transformer layers, further reducing the trainable\nparameters. To validate DVPT, we conduct extensive experiments with different\npre-trained models on medical classification and segmentation tasks. We find\nsuch PEFT method can not only efficiently adapt the pre-trained models to the\nmedical domain, but also brings data efficiency with partial labeled data. For\nexample, with 0.5\\% extra trainable parameters, our method not only outperforms\nstate-of-the-art PEFT methods, even surpasses the full fine-tuning by more than\n2.20\\% Kappa score on medical classification task. It can saves up to 60\\%\nlabeled data and 99\\% storage cost of ViT-B/16.\n","authors":["Along He","Kai Wang","Zhihong Wang","Tao Li","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2307.09787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09781v1","updated":"2023-07-19T06:56:07Z","published":"2023-07-19T06:56:07Z","title":"Text2Layer: Layered Image Generation using Latent Diffusion Model","summary":" Layer compositing is one of the most popular image editing workflows among\nboth amateurs and professionals. 
Motivated by the success of diffusion models,\nwe explore layer compositing from a layered image generation perspective.\nInstead of generating an image, we propose to generate background, foreground,\nlayer mask, and the composed image simultaneously. To achieve layered image\ngeneration, we train an autoencoder that is able to reconstruct layered images\nand train diffusion models on the latent representation. One benefit of the\nproposed problem is to enable better compositing workflows in addition to the\nhigh-quality image output. Another benefit is producing higher-quality layer\nmasks compared to masks produced by a separate step of image segmentation.\nExperimental results show that the proposed method is able to generate\nhigh-quality layered images and initiates a benchmark for future work.\n","authors":["Xinyang Zhang","Wentian Zhao","Xin Lu","Jeff Chien"],"pdf_url":"https://arxiv.org/pdf/2307.09781v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2307.01533v2","updated":"2023-07-19T06:39:36Z","published":"2023-07-04T07:36:48Z","title":"Unsupervised Video Anomaly Detection with Diffusion Models Conditioned\n on Compact Motion Representations","summary":" This paper aims to address the unsupervised video anomaly detection (VAD)\nproblem, which involves classifying each frame in a video as normal or\nabnormal, without any access to labels. To accomplish this, the proposed method\nemploys conditional diffusion models, where the input data is the\nspatiotemporal features extracted from a pre-trained network, and the condition\nis the features extracted from compact motion representations that summarize a\ngiven video segment in terms of its motion and appearance. Our method utilizes\na data-driven threshold and considers a high reconstruction error as an\nindicator of anomalous events. This study is the first to utilize compact\nmotion representations for VAD and the experiments conducted on two large-scale\nVAD benchmarks demonstrate that they supply relevant information to the\ndiffusion model, and consequently improve VAD performances w.r.t the prior art.\nImportantly, our method exhibits better generalization performance across\ndifferent datasets, notably outperforming both the state-of-the-art and\nbaseline methods. The code of our method is available at\nhttps://github.com/AnilOsmanTur/conditioned_video_anomaly_diffusion\n","authors":["Anil Osman Tur","Nicola Dall'Asen","Cigdem Beyan","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2307.01533v2.pdf","comment":"Accepted to ICIAP 2023"},{"id":"http://arxiv.org/abs/2307.09769v1","updated":"2023-07-19T06:07:12Z","published":"2023-07-19T06:07:12Z","title":"Source-Free Domain Adaptation for Medical Image Segmentation via\n Prototype-Anchored Feature Alignment and Contrastive Learning","summary":" Unsupervised domain adaptation (UDA) has increasingly gained interests for\nits capacity to transfer the knowledge learned from a labeled source domain to\nan unlabeled target domain. However, typical UDA methods require concurrent\naccess to both the source and target domain data, which largely limits its\napplication in medical scenarios where source data is often unavailable due to\nprivacy concern. To tackle the source data-absent problem, we present a novel\ntwo-stage source-free domain adaptation (SFDA) framework for medical image\nsegmentation, where only a well-trained source segmentation model and unlabeled\ntarget data are available during domain adaptation. 
Specifically, in the\nprototype-anchored feature alignment stage, we first utilize the weights of the\npre-trained pixel-wise classifier as source prototypes, which preserve the\ninformation of source features. Then, we introduce the bi-directional transport\nto align the target features with class prototypes by minimizing its expected\ncost. On top of that, a contrastive learning stage is further devised to\nutilize those pixels with unreliable predictions for a more compact target\nfeature distribution. Extensive experiments on a cross-modality medical\nsegmentation task demonstrate the superiority of our method in large domain\ndiscrepancy settings compared with the state-of-the-art SFDA approaches and\neven some UDA methods. Code is available at\nhttps://github.com/CSCYQJ/MICCAI23-ProtoContra-SFDA.\n","authors":["Qinji Yu","Nan Xi","Junsong Yuan","Ziyu Zhou","Kang Dang","Xiaowei Ding"],"pdf_url":"https://arxiv.org/pdf/2307.09769v1.pdf","comment":"Accepted by MICCAI23"},{"id":"http://arxiv.org/abs/2009.06205v3","updated":"2023-07-19T06:05:27Z","published":"2020-09-14T05:23:58Z","title":"Joint Demosaicking and Denoising Benefits from a Two-stage Training\n Strategy","summary":" Image demosaicking and denoising are the first two key steps of the color\nimage production pipeline. The classical processing sequence has for a long\ntime consisted of applying denoising first, and then demosaicking. Applying the\noperations in this order leads to oversmoothing and checkerboard effects. Yet,\nit was difficult to change this order, because once the image is demosaicked,\nthe statistical properties of the noise are dramatically changed and hard to\nhandle by traditional denoising models. In this paper, we address this problem\nby a hybrid machine learning method. We invert the traditional color filter\narray (CFA) processing pipeline by first demosaicking and then denoising. Our\ndemosaicking algorithm, trained on noiseless images, combines a traditional\nmethod and a residual convolutional neural network (CNN). This first stage\nretains all known information, which is the key point to obtain faithful final\nresults. The noisy demosaicked image is then passed through a second CNN\nrestoring a noiseless full-color image. This pipeline order completely avoids\ncheckerboard effects and restores fine image detail. Although CNNs can be\ntrained to solve jointly demosaicking-denoising end-to-end, we find that this\ntwo-stage training performs better and is less prone to failure. It is shown\nexperimentally to improve on the state of the art, both quantitatively and in\nterms of visual quality.\n","authors":["Yu Guo","Qiyu Jin","Gabriele Facciolo","Tieyong Zeng","Jean-Michel Morel"],"pdf_url":"https://arxiv.org/pdf/2009.06205v3.pdf","comment":"28 pages, 40 figures"},{"id":"http://arxiv.org/abs/2307.09763v1","updated":"2023-07-19T05:46:56Z","published":"2023-07-19T05:46:56Z","title":"Towards Building More Robust Models with Frequency Bias","summary":" The vulnerability of deep neural networks to adversarial samples has been a\nmajor impediment to their broad applications, despite their success in various\nfields. Recently, some works suggested that adversarially-trained models\nemphasize the importance of low-frequency information to achieve higher\nrobustness. 
While several attempts have been made to leverage this frequency\ncharacteristic, they have all faced the issue that applying low-pass filters\ndirectly to input images leads to irreversible loss of discriminative\ninformation and poor generalizability to datasets with distinct frequency\nfeatures. This paper presents a plug-and-play module called the Frequency\nPreference Control Module that adaptively reconfigures the low- and\nhigh-frequency components of intermediate feature representations, providing\nbetter utilization of frequency in robust learning. Empirical studies show that\nour proposed module can be easily incorporated into any adversarial training\nframework, further improving model robustness across different architectures\nand datasets. Additionally, experiments were conducted to examine how the\nfrequency bias of robust models impacts the adversarial training process and\nits final robustness, revealing interesting insights.\n","authors":["Qingwen Bu","Dong Huang","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2307.09763v1.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2307.08779v2","updated":"2023-07-19T05:43:45Z","published":"2023-07-17T18:50:15Z","title":"Similarity Min-Max: Zero-Shot Day-Night Domain Adaptation","summary":" Low-light conditions not only hamper human visual experience but also degrade\nthe model's performance on downstream vision tasks. While existing works make\nremarkable progress on day-night domain adaptation, they rely heavily on domain\nknowledge derived from the task-specific nighttime dataset. This paper\nchallenges a more complicated scenario with border applicability, i.e.,\nzero-shot day-night domain adaptation, which eliminates reliance on any\nnighttime data. Unlike prior zero-shot adaptation approaches emphasizing either\nimage-level translation or model-level adaptation, we propose a similarity\nmin-max paradigm that considers them under a unified framework. On the image\nlevel, we darken images towards minimum feature similarity to enlarge the\ndomain gap. Then on the model level, we maximize the feature similarity between\nthe darkened images and their normal-light counterparts for better model\nadaptation. To the best of our knowledge, this work represents the pioneering\neffort in jointly optimizing both aspects, resulting in a significant\nimprovement of model generalizability. Extensive experiments demonstrate our\nmethod's effectiveness and broad applicability on various nighttime vision\ntasks, including classification, semantic segmentation, visual place\nrecognition, and video action recognition. Code and pre-trained models are\navailable at https://red-fairy.github.io/ZeroShotDayNightDA-Webpage/.\n","authors":["Rundong Luo","Wenjing Wang","Wenhan Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2307.08779v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09758v1","updated":"2023-07-19T05:41:14Z","published":"2023-07-19T05:41:14Z","title":"Longitudinal Data and a Semantic Similarity Reward for Chest X-Ray\n Report Generation","summary":" Chest X-Ray (CXR) report generation is a promising approach to improving the\nefficiency of CXR interpretation. However, a significant increase in diagnostic\naccuracy is required before that can be realised. Motivated by this, we propose\na framework that is more inline with a radiologist's workflow by considering\nlongitudinal data. Here, the decoder is additionally conditioned on the report\nfrom the subject's previous imaging study via a prompt. 
We also propose a new\nreward for reinforcement learning based on CXR-BERT, which computes the\nsimilarity between reports. We conduct experiments on the MIMIC-CXR dataset.\nThe results indicate that longitudinal data improves CXR report generation.\nCXR-BERT is also shown to be a promising alternative to the current\nstate-of-the-art reward based on RadGraph. This investigation indicates that\nlongitudinal CXR report generation can offer a substantial increase in\ndiagnostic accuracy. Our Hugging Face model is available at:\nhttps://huggingface.co/aehrc/cxrmate and code is available at:\nhttps://github.com/aehrc/cxrmate.\n","authors":["Aaron Nicolson","Jason Dowling","Bevan Koopman"],"pdf_url":"https://arxiv.org/pdf/2307.09758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09756v1","updated":"2023-07-19T05:40:38Z","published":"2023-07-19T05:40:38Z","title":"Generative Prompt Model for Weakly Supervised Object Localization","summary":" Weakly supervised object localization (WSOL) remains challenging when\nlearning object localization models from image category labels. Conventional\nmethods that discriminatively train activation models ignore representative yet\nless discriminative object parts. In this study, we propose a generative prompt\nmodel (GenPromp), defining the first generative pipeline to localize less\ndiscriminative object parts by formulating WSOL as a conditional image\ndenoising procedure. During training, GenPromp converts image category labels\nto learnable prompt embeddings which are fed to a generative model to\nconditionally recover the input image with noise and learn representative\nembeddings. During inference, enPromp combines the representative embeddings\nwith discriminative embeddings (queried from an off-the-shelf vision-language\nmodel) for both representative and discriminative capacity. The combined\nembeddings are finally used to generate multi-scale high-quality attention\nmaps, which facilitate localizing full object extent. Experiments on\nCUB-200-2011 and ILSVRC show that GenPromp respectively outperforms the best\ndiscriminative models by 5.2% and 5.6% (Top-1 Loc), setting a solid baseline\nfor WSOL with the generative model. Code is available at\nhttps://github.com/callsys/GenPromp.\n","authors":["Yuzhong Zhao","Qixiang Ye","Weijia Wu","Chunhua Shen","Fang Wan"],"pdf_url":"https://arxiv.org/pdf/2307.09756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09755v1","updated":"2023-07-19T05:39:15Z","published":"2023-07-19T05:39:15Z","title":"Space Engage: Collaborative Space Supervision for Contrastive-based\n Semi-Supervised Semantic Segmentation","summary":" Semi-Supervised Semantic Segmentation (S4) aims to train a segmentation model\nwith limited labeled images and a substantial volume of unlabeled images. To\nimprove the robustness of representations, powerful methods introduce a\npixel-wise contrastive learning approach in latent space (i.e., representation\nspace) that aggregates the representations to their prototypes in a fully\nsupervised manner. However, previous contrastive-based S4 methods merely rely\non the supervision from the model's output (logits) in logit space during\nunlabeled training. In contrast, we utilize the outputs in both logit space and\nrepresentation space to obtain supervision in a collaborative way. 
The\nsupervision from two spaces plays two roles: 1) reduces the risk of\nover-fitting to incorrect semantic information in logits with the help of\nrepresentations; 2) enhances the knowledge exchange between the two spaces.\nFurthermore, unlike previous approaches, we use the similarity between\nrepresentations and prototypes as a new indicator to tilt training those\nunder-performing representations and achieve a more efficient contrastive\nlearning process. Results on two public benchmarks demonstrate the competitive\nperformance of our method compared with state-of-the-art methods.\n","authors":["Changqi Wang","Haoyu Xie","Yuhui Yuan","Chong Fu","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.09755v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09749v1","updated":"2023-07-19T05:08:47Z","published":"2023-07-19T05:08:47Z","title":"Towards Robust Scene Text Image Super-resolution via Explicit Location\n Enhancement","summary":" Scene text image super-resolution (STISR), aiming to improve image quality\nwhile boosting downstream scene text recognition accuracy, has recently\nachieved great success. However, most existing methods treat the foreground\n(character regions) and background (non-character regions) equally in the\nforward process, and neglect the disturbance from the complex background, thus\nlimiting the performance. To address these issues, in this paper, we propose a\nnovel method LEMMA that explicitly models character regions to produce\nhigh-level text-specific guidance for super-resolution. To model the location\nof characters effectively, we propose the location enhancement module to\nextract character region features based on the attention map sequence. Besides,\nwe propose the multi-modal alignment module to perform bidirectional\nvisual-semantic alignment to generate high-quality prior guidance, which is\nthen incorporated into the super-resolution branch in an adaptive manner using\nthe proposed adaptive fusion module. Experiments on TextZoom and four scene\ntext recognition benchmarks demonstrate the superiority of our method over\nother state-of-the-art methods. Code is available at\nhttps://github.com/csguoh/LEMMA.\n","authors":["Hang Guo","Tao Dai","Guanghao Meng","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2307.09749v1.pdf","comment":"Accepted as IJCAI2023 paper"},{"id":"http://arxiv.org/abs/2307.09748v1","updated":"2023-07-19T04:59:58Z","published":"2023-07-19T04:59:58Z","title":"Watch out Venomous Snake Species: A Solution to SnakeCLEF2023","summary":" The SnakeCLEF2023 competition aims to the development of advanced algorithms\nfor snake species identification through the analysis of images and\naccompanying metadata. This paper presents a method leveraging utilization of\nboth images and metadata. Modern CNN models and strong data augmentation are\nutilized to learn better representation of images. To relieve the challenge of\nlong-tailed distribution, seesaw loss is utilized in our method. We also design\na light model to calculate prior probabilities using metadata features\nextracted from CLIP in post processing stage. Besides, we attach more\nimportance to venomous species by assigning venomous species labels to some\nexamples that model is uncertain about. Our method achieves 91.31% score of the\nfinal metric combined of F1 and other metrics on private leaderboard, which is\nthe 1st place among the participators. 
The code is available at\nhttps://github.com/xiaoxsparraw/CLEF2023.\n","authors":["Feiran Hu","Peng Wang","Yangyang Li","Chenlong Duan","Zijian Zhu","Fei Wang","Faen Zhang","Yong Li","Xiu-Shen Wei"],"pdf_url":"https://arxiv.org/pdf/2307.09748v1.pdf","comment":"This work was the winner solution of the SnakeCLEF2023 challenge"},{"id":"http://arxiv.org/abs/2307.09742v1","updated":"2023-07-19T04:07:33Z","published":"2023-07-19T04:07:33Z","title":"Improved Distribution Matching for Dataset Condensation","summary":" Dataset Condensation aims to condense a large dataset into a smaller one\nwhile maintaining its ability to train a well-performing model, thus reducing\nthe storage cost and training effort in deep learning applications. However,\nconventional dataset condensation methods are optimization-oriented and\ncondense the dataset by performing gradient or parameter matching during model\noptimization, which is computationally intensive even on small datasets and\nmodels. In this paper, we propose a novel dataset condensation method based on\ndistribution matching, which is more efficient and promising. Specifically, we\nidentify two important shortcomings of naive distribution matching (i.e.,\nimbalanced feature numbers and unvalidated embeddings for distance computation)\nand address them with three novel techniques (i.e., partitioning and expansion\naugmentation, efficient and enriched model sampling, and class-aware\ndistribution regularization). Our simple yet effective method outperforms most\nprevious optimization-oriented methods with much fewer computational resources,\nthereby scaling data condensation to larger datasets and models. Extensive\nexperiments demonstrate the effectiveness of our method. Codes are available at\nhttps://github.com/uitrbn/IDM\n","authors":["Ganlong Zhao","Guanbin Li","Yipeng Qin","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2307.09742v1.pdf","comment":"CVPR2023"},{"id":"http://arxiv.org/abs/2306.13074v3","updated":"2023-07-19T03:46:37Z","published":"2023-06-22T17:47:08Z","title":"Iterative Scale-Up ExpansionIoU and Deep Features Association for\n Multi-Object Tracking in Sports","summary":" Multi-object tracking algorithms have made significant advancements due to\nthe recent developments in object detection. However, most existing methods\nprimarily focus on tracking pedestrians or vehicles, which exhibit relatively\nsimple and regular motion patterns. Consequently, there is a scarcity of\nalgorithms that address the tracking of targets with irregular or non-linear\nmotion, such as multi-athlete tracking. Furthermore, popular tracking\nalgorithms often rely on the Kalman filter for object motion modeling, which\nfails to track objects when their motion contradicts the linear motion\nassumption of the Kalman filter. Due to this reason, we proposed a novel online\nand robust multi-object tracking approach, named Iterative Scale-Up\nExpansionIoU and Deep Features for multi-object tracking. Unlike conventional\nmethods, we abandon the use of the Kalman filter and propose utilizing the\niterative scale-up expansion IoU. This approach achieves superior tracking\nperformance without requiring additional training data or adopting a more\nrobust detector, all while maintaining a lower computational cost compared to\nother appearance-based methods. Our proposed method demonstrates remarkable\neffectiveness in tracking irregular motion objects, achieving a score of 76.9%\nin HOTA. 
It outperforms all state-of-the-art tracking algorithms on the\nSportsMOT dataset, covering various kinds of sport scenarios.\n","authors":["Hsiang-Wei Huang","Cheng-Yen Yang","Jiacheng Sun","Jenq-Neng Hwang","Chung-I Huang"],"pdf_url":"https://arxiv.org/pdf/2306.13074v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07859v2","updated":"2023-07-19T03:04:50Z","published":"2023-07-15T17:45:17Z","title":"Unified Adversarial Patch for Cross-modal Attacks in the Physical World","summary":" Recently, physical adversarial attacks have been presented to evade\nDNNs-based object detectors. To ensure the security, many scenarios are\nsimultaneously deployed with visible sensors and infrared sensors, leading to\nthe failures of these single-modal physical attacks. To show the potential\nrisks under such scenes, we propose a unified adversarial patch to perform\ncross-modal physical attacks, i.e., fooling visible and infrared object\ndetectors at the same time via a single patch. Considering different imaging\nmechanisms of visible and infrared sensors, our work focuses on modeling the\nshapes of adversarial patches, which can be captured in different modalities\nwhen they change. To this end, we design a novel boundary-limited shape\noptimization to achieve the compact and smooth shapes, and thus they can be\neasily implemented in the physical world. In addition, to balance the fooling\ndegree between visible detector and infrared detector during the optimization\nprocess, we propose a score-aware iterative evaluation, which can guide the\nadversarial patch to iteratively reduce the predicted scores of the multi-modal\nsensors. We finally test our method against the one-stage detector: YOLOv3 and\nthe two-stage detector: Faster RCNN. Results show that our unified patch\nachieves an Attack Success Rate (ASR) of 73.33% and 69.17%, respectively. More\nimportantly, we verify the effective attacks in the physical world when visible\nand infrared sensors shoot the objects under various settings like different\nangles, distances, postures, and scenes.\n","authors":["Xingxing Wei","Yao Huang","Yitong Sun","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2307.07859v2.pdf","comment":"10 pages, 8 figures, accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2306.16197v3","updated":"2023-07-19T02:53:36Z","published":"2023-06-28T13:23:33Z","title":"Multi-IMU with Online Self-Consistency for Freehand 3D Ultrasound\n Reconstruction","summary":" Ultrasound (US) imaging is a popular tool in clinical diagnosis, offering\nsafety, repeatability, and real-time capabilities. Freehand 3D US is a\ntechnique that provides a deeper understanding of scanned regions without\nincreasing complexity. However, estimating elevation displacement and\naccumulation error remains challenging, making it difficult to infer the\nrelative position using images alone. The addition of external lightweight\nsensors has been proposed to enhance reconstruction performance without adding\ncomplexity, which has been shown to be beneficial. We propose a novel online\nself-consistency network (OSCNet) using multiple inertial measurement units\n(IMUs) to improve reconstruction performance. OSCNet utilizes a modal-level\nself-supervised strategy to fuse multiple IMU information and reduce\ndifferences between reconstruction results obtained from each IMU data.\nAdditionally, a sequence-level self-consistency strategy is proposed to improve\nthe hierarchical consistency of prediction results among the scanning sequence\nand its sub-sequences. 
Experiments on large-scale arm and carotid datasets with\nmultiple scanning tactics demonstrate that our OSCNet outperforms previous\nmethods, achieving state-of-the-art reconstruction performance.\n","authors":["Mingyuan Luo","Xin Yang","Zhongnuo Yan","Junyu Li","Yuanji Zhang","Jiongquan Chen","Xindi Hu","Jikuan Qian","Jun Cheng","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2306.16197v3.pdf","comment":"Accepted by MICCAI-2023"},{"id":"http://arxiv.org/abs/2307.09732v1","updated":"2023-07-19T02:49:44Z","published":"2023-07-19T02:49:44Z","title":"ClickSeg: 3D Instance Segmentation with Click-Level Weak Annotations","summary":" 3D instance segmentation methods often require fully-annotated dense labels\nfor training, which are costly to obtain. In this paper, we present ClickSeg, a\nnovel click-level weakly supervised 3D instance segmentation method that\nrequires one point per instance annotation merely. Such a problem is very\nchallenging due to the extremely limited labels, which has rarely been solved\nbefore. We first develop a baseline weakly-supervised training method, which\ngenerates pseudo labels for unlabeled data by the model itself. To utilize the\nproperty of click-level annotation setting, we further propose a new training\nframework. Instead of directly using the model inference way, i.e., mean-shift\nclustering, to generate the pseudo labels, we propose to use k-means with fixed\ninitial seeds: the annotated points. New similarity metrics are further\ndesigned for clustering. Experiments on ScanNetV2 and S3DIS datasets show that\nthe proposed ClickSeg surpasses the previous best weakly supervised instance\nsegmentation result by a large margin (e.g., +9.4% mAP on ScanNetV2). Using\n0.02% supervision signals merely, ClickSeg achieves $\\sim$90% of the accuracy\nof the fully-supervised counterpart. Meanwhile, it also achieves\nstate-of-the-art semantic segmentation results among weakly supervised methods\nthat use the same annotation settings.\n","authors":["Leyao Liu","Tao Kong","Minzhao Zhu","Jiashuo Fan","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2307.09732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09729v1","updated":"2023-07-19T02:33:42Z","published":"2023-07-19T02:33:42Z","title":"NTIRE 2023 Quality Assessment of Video Enhancement Challenge","summary":" This paper reports on the NTIRE 2023 Quality Assessment of Video Enhancement\nChallenge, which will be held in conjunction with the New Trends in Image\nRestoration and Enhancement Workshop (NTIRE) at CVPR 2023. This challenge is to\naddress a major challenge in the field of video processing, namely, video\nquality assessment (VQA) for enhanced videos. The challenge uses the VQA\nDataset for Perceptual Video Enhancement (VDPVE), which has a total of 1211\nenhanced videos, including 600 videos with color, brightness, and contrast\nenhancements, 310 videos with deblurring, and 301 deshaked videos. The\nchallenge has a total of 167 registered participants. 61 participating teams\nsubmitted their prediction results during the development phase, with a total\nof 3168 submissions. A total of 176 submissions were submitted by 37\nparticipating teams during the final testing phase. Finally, 19 participating\nteams submitted their models and fact sheets, and detailed the methods they\nused. 
Some methods have achieved better results than baseline methods, and the\nwinning methods have demonstrated superior prediction performance.\n","authors":["Xiaohong Liu","Xiongkuo Min","Wei Sun","Yulun Zhang","Kai Zhang","Radu Timofte","Guangtao Zhai","Yixuan Gao","Yuqin Cao","Tengchuan Kou","Yunlong Dong","Ziheng Jia","Yilin Li","Wei Wu","Shuming Hu","Sibin Deng","Pengxiang Xiao","Ying Chen","Kai Li","Kai Zhao","Kun Yuan","Ming Sun","Heng Cong","Hao Wang","Lingzhi Fu","Yusheng Zhang","Rongyu Zhang","Hang Shi","Qihang Xu","Longan Xiao","Zhiliang Ma","Mirko Agarla","Luigi Celona","Claudio Rota","Raimondo Schettini","Zhiwei Huang","Yanan Li","Xiaotao Wang","Lei Lei","Hongye Liu","Wei Hong","Ironhead Chuang","Allen Lin","Drake Guan","Iris Chen","Kae Lou","Willy Huang","Yachun Tasi","Yvonne Kao","Haotian Fan","Fangyuan Kong","Shiqi Zhou","Hao Liu","Yu Lai","Shanshan Chen","Wenqi Wang","Haoning Wu","Chaofeng Chen","Chunzheng Zhu","Zekun Guo","Shiling Zhao","Haibing Yin","Hongkui Wang","Hanene Brachemi Meftah","Sid Ahmed Fezza","Wassim Hamidouche","Olivier Déforges","Tengfei Shi","Azadeh Mansouri","Hossein Motamednia","Amir Hossein Bakhtiari","Ahmad Mahmoudi Aznaveh"],"pdf_url":"https://arxiv.org/pdf/2307.09729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09023v2","updated":"2023-07-19T02:30:48Z","published":"2023-07-18T07:25:38Z","title":"LA-Net: Landmark-Aware Learning for Reliable Facial Expression\n Recognition under Label Noise","summary":" Facial expression recognition (FER) remains a challenging task due to the\nambiguity of expressions. The derived noisy labels significantly harm the\nperformance in real-world scenarios. To address this issue, we present a new\nFER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks\nto mitigate the impact of label noise from two perspectives. Firstly, LA-Net\nuses landmark information to suppress the uncertainty in expression space and\nconstructs the label distribution of each sample by neighborhood aggregation,\nwhich in turn improves the quality of training supervision. Secondly, the model\nincorporates landmark information into expression representations using the\ndevised expression-landmark contrastive loss. The enhanced expression feature\nextractor can be less susceptible to label noise. Our method can be integrated\nwith any deep neural network for better training supervision without\nintroducing extra inference costs. We conduct extensive experiments on both\nin-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net\nachieves state-of-the-art performance.\n","authors":["Zhiyu Wu","Jinshi Cui"],"pdf_url":"https://arxiv.org/pdf/2307.09023v2.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09728v1","updated":"2023-07-19T02:29:57Z","published":"2023-07-19T02:29:57Z","title":"Uncertainty-Driven Multi-Scale Feature Fusion Network for Real-time\n Image Deraining","summary":" Visual-based measurement systems are frequently affected by rainy weather due\nto the degradation caused by rain streaks in captured images, and existing\nimaging devices struggle to address this issue in real-time. While most efforts\nleverage deep networks for image deraining and have made progress, their large\nparameter sizes hinder deployment on resource-constrained devices.\nAdditionally, these data-driven models often produce deterministic results,\nwithout considering their inherent epistemic uncertainty, which can lead to\nundesired reconstruction errors. 
Well-calibrated uncertainty can help alleviate\nprediction errors and assist measurement devices in mitigating risks and\nimproving usability. Therefore, we propose an Uncertainty-Driven Multi-Scale\nFeature Fusion Network (UMFFNet) that learns the probability mapping\ndistribution between paired images to estimate uncertainty. Specifically, we\nintroduce an uncertainty feature fusion block (UFFB) that utilizes uncertainty\ninformation to dynamically enhance acquired features and focus on blurry\nregions obscured by rain streaks, reducing prediction errors. In addition, to\nfurther boost the performance of UMFFNet, we fused feature information from\nmultiple scales to guide the network for efficient collaborative rain removal.\nExtensive experiments demonstrate that UMFFNet achieves significant performance\nimprovements with few parameters, surpassing state-of-the-art image deraining\nmethods.\n","authors":["Ming Tong","Xuefeng Yan","Yongzhen Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09727v1","updated":"2023-07-19T02:28:41Z","published":"2023-07-19T02:28:41Z","title":"SAMConvex: Fast Discrete Optimization for CT Registration using\n Self-supervised Anatomical Embedding and Correlation Pyramid","summary":" Estimating displacement vector field via a cost volume computed in the\nfeature space has shown great success in image registration, but it suffers\nexcessive computation burdens. Moreover, existing feature descriptors only\nextract local features incapable of representing the global semantic\ninformation, which is especially important for solving large transformations.\nTo address the discussed issues, we propose SAMConvex, a fast coarse-to-fine\ndiscrete optimization method for CT registration that includes a decoupled\nconvex optimization procedure to obtain deformation fields based on a\nself-supervised anatomical embedding (SAM) feature extractor that captures both\nlocal and global information. To be specific, SAMConvex extracts per-voxel\nfeatures and builds 6D correlation volumes based on SAM features, and\niteratively updates a flow field by performing lookups on the correlation\nvolumes with a coarse-to-fine scheme. SAMConvex outperforms the\nstate-of-the-art learning-based methods and optimization-based methods over two\ninter-patient registration datasets (Abdomen CT and HeadNeck CT) and one\nintra-patient registration dataset (Lung CT). Moreover, as an\noptimization-based method, SAMConvex only takes $\\sim2$s ($\\sim5s$ with\ninstance optimization) for one paired images.\n","authors":["Zi Li","Lin Tian","Tony C. W. Mok","Xiaoyu Bai","Puyang Wang","Jia Ge","Jingren Zhou","Le Lu","Xianghua Ye","Ke Yan","Dakai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.09727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09724v1","updated":"2023-07-19T02:26:20Z","published":"2023-07-19T02:26:20Z","title":"AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks","summary":" To deliver the artistic expression of the target style, recent studies\nexploit the attention mechanism owing to its ability to map the local patches\nof the style image to the corresponding patches of the content image. 
However,\nbecause of the low semantic correspondence between arbitrary content and\nartworks, the attention module repeatedly abuses specific local patches from\nthe style image, resulting in disharmonious and evident repetitive artifacts.\nTo overcome this limitation and accomplish impeccable artistic style transfer,\nwe focus on enhancing the attention mechanism and capturing the rhythm of\npatterns that organize the style. In this paper, we introduce a novel metric,\nnamely pattern repeatability, that quantifies the repetition of patterns in the\nstyle image. Based on the pattern repeatability, we propose Aesthetic\nPattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot\nof local and global style expressions. In addition, we propose a novel\nself-supervisory task to encourage the attention mechanism to learn precise and\nmeaningful semantic correspondence. Lastly, we introduce the patch-wise style\nloss to transfer the elaborate rhythm of local patterns. Through qualitative\nand quantitative evaluations, we verify the reliability of the proposed pattern\nrepeatability that aligns with human perception, and demonstrate the\nsuperiority of the proposed framework.\n","authors":["Kibeom Hong","Seogkyu Jeon","Junsoo Lee","Namhyuk Ahn","Kunhee Kim","Pilhyeon Lee","Daesik Kim","Youngjung Uh","Hyeran Byun"],"pdf_url":"https://arxiv.org/pdf/2307.09724v1.pdf","comment":"Accepted by ICCV 2023. Code is available at this\n https://github.com/Kibeom-Hong/AesPA-Net"},{"id":"http://arxiv.org/abs/2212.04761v2","updated":"2023-07-19T02:20:18Z","published":"2022-12-09T10:37:22Z","title":"Leveraging Spatio-Temporal Dependency for Skeleton-Based Action\n Recognition","summary":" Skeleton-based action recognition has attracted considerable attention due to\nits compact representation of the human body's skeletal sructure. Many recent\nmethods have achieved remarkable performance using graph convolutional networks\n(GCNs) and convolutional neural networks (CNNs), which extract spatial and\ntemporal features, respectively. Although spatial and temporal dependencies in\nthe human skeleton have been explored separately, spatio-temporal dependency is\nrarely considered. In this paper, we propose the Spatio-Temporal Curve Network\n(STC-Net) to effectively leverage the spatio-temporal dependency of the human\nskeleton. Our proposed network consists of two novel elements: 1) The\nSpatio-Temporal Curve (STC) module; and 2) Dilated Kernels for Graph\nConvolution (DK-GC). The STC module dynamically adjusts the receptive field by\nidentifying meaningful node connections between every adjacent frame and\ngenerating spatio-temporal curves based on the identified node connections,\nproviding an adaptive spatio-temporal coverage. In addition, we propose DK-GC\nto consider long-range dependencies, which results in a large receptive field\nwithout any additional parameters by applying an extended kernel to the given\nadjacency matrices of the graph. 
Our STC-Net combines these two modules and\nachieves state-of-the-art performance on four skeleton-based action recognition\nbenchmarks.\n","authors":["Jungho Lee","Minhyeok Lee","Suhwan Cho","Sungmin Woo","Sungjun Jang","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2212.04761v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09721v1","updated":"2023-07-19T02:11:19Z","published":"2023-07-19T02:11:19Z","title":"Multi-Grained Multimodal Interaction Network for Entity Linking","summary":" Multimodal entity linking (MEL) task, which aims at resolving ambiguous\nmentions to a multimodal knowledge graph, has attracted wide attention in\nrecent years. Though large efforts have been made to explore the complementary\neffect among multiple modalities, however, they may fail to fully absorb the\ncomprehensive expression of abbreviated textual context and implicit visual\nindication. Even worse, the inevitable noisy data may cause inconsistency of\ndifferent modalities during the learning process, which severely degenerates\nthe performance. To address the above issues, in this paper, we propose a novel\nMulti-GraIned Multimodal InteraCtion Network $\\textbf{(MIMIC)}$ framework for\nsolving the MEL task. Specifically, the unified inputs of mentions and entities\nare first encoded by textual/visual encoders separately, to extract global\ndescriptive features and local detailed features. Then, to derive the\nsimilarity matching score for each mention-entity pair, we device three\ninteraction units to comprehensively explore the intra-modal interaction and\ninter-modal fusion among features of entities and mentions. In particular,\nthree modules, namely the Text-based Global-Local interaction Unit (TGLU),\nVision-based DuaL interaction Unit (VDLU) and Cross-Modal Fusion-based\ninteraction Unit (CMFU) are designed to capture and integrate the fine-grained\nrepresentation lying in abbreviated text and implicit visual cues. Afterwards,\nwe introduce a unit-consistency objective function via contrastive learning to\navoid inconsistency and model degradation. Experimental results on three public\nbenchmark datasets demonstrate that our solution outperforms various\nstate-of-the-art baselines, and ablation studies verify the effectiveness of\ndesigned modules.\n","authors":["Pengfei Luo","Tong Xu","Shiwei Wu","Chen Zhu","Linli Xu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09721v1.pdf","comment":"Accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2307.09715v1","updated":"2023-07-19T01:57:31Z","published":"2023-07-19T01:57:31Z","title":"Semantic-Aware Dual Contrastive Learning for Multi-label Image\n Classification","summary":" Extracting image semantics effectively and assigning corresponding labels to\nmultiple objects or attributes for natural images is challenging due to the\ncomplex scene contents and confusing label dependencies. Recent works have\nfocused on modeling label relationships with graph and understanding object\nregions using class activation maps (CAM). However, these methods ignore the\ncomplex intra- and inter-category relationships among specific semantic\nfeatures, and CAM is prone to generate noisy information. To this end, we\npropose a novel semantic-aware dual contrastive learning framework that\nincorporates sample-to-sample contrastive learning (SSCL) as well as\nprototype-to-sample contrastive learning (PSCL). 
Specifically, we leverage\nsemantic-aware representation learning to extract category-related local\ndiscriminative features and construct category prototypes. Then based on SSCL,\nlabel-level visual representations of the same category are aggregated\ntogether, and features belonging to distinct categories are separated.\nMeanwhile, we construct a novel PSCL module to narrow the distance between\npositive samples and category prototypes and push negative samples away from\nthe corresponding category prototypes. Finally, the discriminative label-level\nfeatures related to the image content are accurately captured by the joint\ntraining of the above three parts. Experiments on five challenging large-scale\npublic datasets demonstrate that our proposed method is effective and\noutperforms the state-of-the-art methods. Code and supplementary materials are\nreleased on https://github.com/yu-gi-oh-leilei/SADCL.\n","authors":["Leilei Ma","Dengdi Sun","Lei Wang","Haifang Zhao","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2307.09715v1.pdf","comment":"8 pages, 6 figures, accepted by ECAI 23"},{"id":"http://arxiv.org/abs/2307.07928v2","updated":"2023-07-19T01:43:59Z","published":"2023-07-16T02:44:19Z","title":"Reinforced Disentanglement for Face Swapping without Skip Connection","summary":" The SOTA face swap models still suffer the problem of either target identity\n(i.e., shape) being leaked or the target non-identity attributes (i.e.,\nbackground, hair) failing to be fully preserved in the final results. We show\nthat this insufficient disentanglement is caused by two flawed designs that\nwere commonly adopted in prior models: (1) counting on only one compressed\nencoder to represent both the semantic-level non-identity facial\nattributes(i.e., pose) and the pixel-level non-facial region details, which is\ncontradictory to satisfy at the same time; (2) highly relying on long\nskip-connections between the encoder and the final generator, leaking a certain\namount of target face identity into the result. To fix them, we introduce a new\nface swap framework called 'WSC-swap' that gets rid of skip connections and\nuses two target encoders to respectively capture the pixel-level non-facial\nregion attributes and the semantic non-identity attributes in the face region.\nTo further reinforce the disentanglement learning for the target encoder, we\nemploy both identity removal loss via adversarial training (i.e., GAN) and the\nnon-identity preservation loss via prior 3DMM models like [11]. Extensive\nexperiments on both FaceForensics++ and CelebA-HQ show that our results\nsignificantly outperform previous works on a rich set of metrics, including one\nnovel metric for measuring identity consistency that was completely neglected\nbefore.\n","authors":["Xiaohang Ren","Xingyu Chen","Pengfei Yao","Heung-Yeung Shum","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.07928v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.03135v2","updated":"2023-07-19T01:28:30Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. 
Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. We propose two principles\nfrom vision and language modality perspectives to enhance student's OOD\ngeneralization: (1) by better imitating teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and finegrained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate their techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. Code released at\nhttps://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v2.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2307.09153v2","updated":"2023-07-19T01:27:17Z","published":"2023-07-18T11:24:42Z","title":"OPHAvatars: One-shot Photo-realistic Head Avatars","summary":" We propose a method for synthesizing photo-realistic digital avatars from\nonly one portrait as the reference. Given a portrait, our method synthesizes a\ncoarse talking head video using driving keypoints features. And with the coarse\nvideo, our method synthesizes a coarse talking head avatar with a deforming\nneural radiance field. With rendered images of the coarse avatar, our method\nupdates the low-quality images with a blind face restoration model. With\nupdated images, we retrain the avatar for higher quality. After several\niterations, our method can synthesize a photo-realistic animatable 3D neural\nhead avatar. The motivation of our method is deformable neural radiance field\ncan eliminate the unnatural distortion caused by the image2video method. Our\nmethod outperforms state-of-the-art methods in quantitative and qualitative\nstudies on various subjects.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2307.09153v2.pdf","comment":"code: https://github.com/lsx0101/OPHAvatars"},{"id":"http://arxiv.org/abs/2307.09696v1","updated":"2023-07-19T00:41:39Z","published":"2023-07-19T00:41:39Z","title":"Towards Saner Deep Image Registration","summary":" With recent advances in computing hardware and surges of deep-learning\narchitectures, learning-based deep image registration methods have surpassed\ntheir traditional counterparts, in terms of metric performance and inference\ntime. However, these methods focus on improving performance measurements such\nas Dice, resulting in less attention given to model behaviors that are equally\ndesirable for registrations, especially for medical imaging. This paper\ninvestigates these behaviors for popular learning-based deep registrations\nunder a sanity-checking microscope. 
We find that most existing registrations\nsuffer from low inverse consistency and nondiscrimination of identical pairs\ndue to overly optimized image similarities. To rectify these behaviors, we\npropose a novel regularization-based sanity-enforcer method that imposes two\nsanity checks on the deep model to reduce its inverse consistency errors and\nincrease its discriminative power simultaneously. Moreover, we derive a set of\ntheoretical guarantees for our sanity-checked image registration method, with\nexperimental results supporting our theoretical findings and their\neffectiveness in increasing the sanity of models without sacrificing any\nperformance. Our code and models are available at\n\\url{https://github.com/tuffr5/Saner-deep-registration}.\n","authors":["Bin Duan","Ming Zhong","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2307.09696v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.09693v1","updated":"2023-07-19T00:36:05Z","published":"2023-07-19T00:36:05Z","title":"GlobalMapper: Arbitrary-Shaped Urban Layout Generation","summary":" Modeling and designing urban building layouts is of significant interest in\ncomputer vision, computer graphics, and urban applications. A building layout\nconsists of a set of buildings in city blocks defined by a network of roads. We\nobserve that building layouts are discrete structures, consisting of multiple\nrows of buildings of various shapes, and are amenable to skeletonization for\nmapping arbitrary city block shapes to a canonical form. Hence, we propose a\nfully automatic approach to building layout generation using graph attention\nnetworks. Our method generates realistic urban layouts given arbitrary road\nnetworks, and enables conditional generation based on learned priors. Our\nresults, including user study, demonstrate superior performance as compared to\nprior layout generation networks, support arbitrary city block and varying\nbuilding shapes as demonstrated by generating layouts for 28 large cities.\n","authors":["Liu He","Daniel Aliaga"],"pdf_url":"https://arxiv.org/pdf/2307.09693v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10501v1","updated":"2023-07-19T23:57:39Z","published":"2023-07-19T23:57:39Z","title":"Eye Disease Classification Using Deep Learning Techniques","summary":" Eye is the essential sense organ for vision function. Due to the fact that\ncertain eye disorders might result in vision loss, it is essential to diagnose\nand treat eye diseases early on. By identifying common eye illnesses and\nperforming an eye check, eye care providers can safeguard patients against\nvision loss or blindness. Convolutional neural networks (CNN) and transfer\nlearning were employed in this study to discriminate between a normal eye and\none with diabetic retinopathy, cataract, or glaucoma disease. Using transfer\nlearning for multi-class classification, high accuracy was achieved at 94%\nwhile the traditional CNN achieved 84% rate.\n","authors":["Tareq Babaqi","Manar Jaradat","Ayse Erdem Yildirim","Saif H. Al-Nimer","Daehan Won"],"pdf_url":"https://arxiv.org/pdf/2307.10501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10499v1","updated":"2023-07-19T23:55:15Z","published":"2023-07-19T23:55:15Z","title":"Mining Conditional Part Semantics with Occluded Extrapolation for\n Human-Object Interaction Detection","summary":" Human-Object Interaction Detection is a crucial aspect of human-centric scene\nunderstanding, with important applications in various domains. 
Despite recent\nprogress in this field, recognizing subtle and detailed interactions remains\nchallenging. Existing methods try to use human-related clues to alleviate the\ndifficulty, but rely heavily on external annotations or knowledge, limiting\ntheir practical applicability in real-world scenarios. In this work, we propose\na novel Part Semantic Network (PSN) to solve this problem. The core of PSN is a\nConditional Part Attention (CPA) mechanism, where human features are taken as\nkeys and values, and the object feature is used as query for the computation in\na cross-attention mechanism. In this way, our model learns to automatically\nfocus on the most informative human parts conditioned on the involved object,\ngenerating more semantically meaningful features for interaction recognition.\nAdditionally, we propose an Occluded Part Extrapolation (OPE) strategy to\nfacilitate interaction recognition under occluded scenarios, which teaches the\nmodel to extrapolate detailed features from partially occluded ones. Our method\nconsistently outperforms prior approaches on the V-COCO and HICO-DET datasets,\nwithout external data or extra annotations. Additional ablation studies\nvalidate the effectiveness of each component of our proposed method.\n","authors":["Guangzhi Wang","Yangyang Guo","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2307.10499v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2307.10495v1","updated":"2023-07-19T23:25:21Z","published":"2023-07-19T23:25:21Z","title":"Novel Batch Active Learning Approach and Its Application to Synthetic\n Aperture Radar Datasets","summary":" Active learning improves the performance of machine learning methods by\njudiciously selecting a limited number of unlabeled data points to query for\nlabels, with the aim of maximally improving the underlying classifier's\nperformance. Recent gains have been made using sequential active learning for\nsynthetic aperture radar (SAR) data arXiv:2204.00005. In each iteration,\nsequential active learning selects a query set of size one while batch active\nlearning selects a query set of multiple datapoints. While batch active\nlearning methods exhibit greater efficiency, the challenge lies in maintaining\nmodel accuracy relative to sequential active learning methods. We developed a\nnovel, two-part approach for batch active learning: Dijkstra's Annulus Core-Set\n(DAC) for core-set generation and LocalMax for batch sampling. The batch active\nlearning process that combines DAC and LocalMax achieves nearly identical\naccuracy as sequential active learning but is more efficient, proportional to\nthe batch size. As an application, a pipeline is built based on transfer\nlearning feature embedding, graph learning, DAC, and LocalMax to classify the\nFUSAR-Ship and OpenSARShip datasets. Our pipeline outperforms the\nstate-of-the-art CNN-based methods.\n","authors":["James Chapman","Bohan Chen","Zheng Tan","Jeff Calder","Kevin Miller","Andrea L. Bertozzi"],"pdf_url":"https://arxiv.org/pdf/2307.10495v1.pdf","comment":"16 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2307.10487v1","updated":"2023-07-19T22:46:35Z","published":"2023-07-19T22:46:35Z","title":"Backdoor Attack against Object Detection with Clean Annotation","summary":" Deep neural networks (DNNs) have shown unprecedented success in object\ndetection tasks. However, it was also discovered that DNNs are vulnerable to\nmultiple kinds of attacks, including Backdoor Attacks. 
Through the attack, the\nattacker manages to embed a hidden backdoor into the DNN such that the model\nbehaves normally on benign data samples, but makes attacker-specified judgments\ngiven the occurrence of a predefined trigger. Although numerous backdoor\nattacks have been experimented on image classification, backdoor attacks on\nobject detection tasks have not been properly investigated and explored. As\nobject detection has been adopted as an important module in multiple\nsecurity-sensitive applications such as autonomous driving, backdoor attacks on\nobject detection could pose even more severe threats. Inspired by the inherent\nproperty of deep learning-based object detectors, we propose a simple yet\neffective backdoor attack method against object detection without modifying the\nground truth annotations, specifically focusing on the object disappearance\nattack and object generation attack. Extensive experiments and ablation studies\nprove the effectiveness of our attack on two benchmark object detection\ndatasets, PASCAL VOC07+12 and MSCOCO, on which we achieve an attack success\nrate of more than 92% with a poison rate of only 5%.\n","authors":["Yize Cheng","Wenbin Hu","Minhao Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.10487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10475v1","updated":"2023-07-19T22:14:49Z","published":"2023-07-19T22:14:49Z","title":"Findings of Factify 2: Multimodal Fake News Detection","summary":" With social media usage growing exponentially in the past few years, fake\nnews has also become extremely prevalent. The detrimental impact of fake news\nemphasizes the need for research focused on automating the detection of false\ninformation and verifying its accuracy. In this work, we present the outcome of\nthe Factify 2 shared task, which provides a multi-modal fact verification and\nsatire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data\ncalls for a comparison based approach to the task by pairing social media\nclaims with supporting documents, with both text and image, divided into 5\nclasses based on multi-modal relations. In the second iteration of this task we\nhad over 60 participants and 9 final test-set submissions. The best\nperformances came from the use of DeBERTa for text and Swinv2 and CLIP for\nimage. The highest F1 score averaged for all five classes was 81.82%.\n","authors":["S Suryavardan","Shreyash Mishra","Megha Chakraborty","Parth Patwa","Anku Rani","Aman Chadha","Aishwarya Reganti","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.10475v1.pdf","comment":"Defactify2 @AAAI 2023"},{"id":"http://arxiv.org/abs/2307.10471v1","updated":"2023-07-19T21:45:07Z","published":"2023-07-19T21:45:07Z","title":"Classification of Visualization Types and Perspectives in Patents","summary":" Due to the swift growth of patent applications each year, information and\nmultimedia retrieval approaches that facilitate patent exploration and\nretrieval are of utmost importance. Different types of visualizations (e.g.,\ngraphs, technical drawings) and perspectives (e.g., side view, perspective) are\nused to visualize details of innovations in patents. The classification of\nthese images enables a more efficient search and allows for further analysis.\nSo far, datasets for image type classification miss some important\nvisualization types for patents. Furthermore, related work does not make use of\nrecent deep learning approaches including transformers. 
In this paper, we adopt\nstate-of-the-art deep learning methods for the classification of visualization\ntypes and perspectives in patent images. We extend the CLEF-IP dataset for\nimage type classification in patents to ten classes and provide manual ground\ntruth annotations. In addition, we derive a set of hierarchical classes from a\ndataset that provides weakly-labeled data for image perspectives. Experimental\nresults have demonstrated the feasibility of the proposed approaches. Source\ncode, models, and dataset will be made publicly available.\n","authors":["Junaid Ahmed Ghauri","Eric Müller-Budack","Ralph Ewerth"],"pdf_url":"https://arxiv.org/pdf/2307.10471v1.pdf","comment":"Accepted in International Conference on Theory and Practice of\n Digital Libraries (TPDL) 2023 (They have the copyright to publish\n camera-ready version of this work)"},{"id":"http://arxiv.org/abs/2307.10455v1","updated":"2023-07-19T20:54:08Z","published":"2023-07-19T20:54:08Z","title":"A Step Towards Worldwide Biodiversity Assessment: The BIOSCAN-1M Insect\n Dataset","summary":" In an effort to catalog insect biodiversity, we propose a new large dataset\nof hand-labelled insect images, the BIOSCAN-Insect Dataset. Each record is\ntaxonomically classified by an expert, and also has associated genetic\ninformation including raw nucleotide barcode sequences and assigned barcode\nindex numbers, which are genetically-based proxies for species classification.\nThis paper presents a curated million-image dataset, primarily to train\ncomputer-vision models capable of providing image-based taxonomic assessment,\nhowever, the dataset also presents compelling characteristics, the study of\nwhich would be of interest to the broader machine learning community. Driven by\nthe biological nature inherent to the dataset, a characteristic long-tailed\nclass-imbalance distribution is exhibited. Furthermore, taxonomic labelling is\na hierarchical classification scheme, presenting a highly fine-grained\nclassification problem at lower levels. Beyond spurring interest in\nbiodiversity research within the machine learning community, progress on\ncreating an image-based taxonomic classifier will also further the ultimate\ngoal of all BIOSCAN research: to lay the foundation for a comprehensive survey\nof global biodiversity. This paper introduces the dataset and explores the\nclassification task through the implementation and analysis of a baseline\nclassifier.\n","authors":["Zahra Gharaee","ZeMing Gong","Nicholas Pellegrino","Iuliia Zarubiieva","Joakim Bruslund Haurum","Scott C. Lowe","Jaclyn T. A. McKeown","Chris C. Y. Ho","Joschka McLeod","Yi-Yun C Wei","Jireh Agda","Sujeevan Ratnasingham","Dirk Steinke","Angel X. Chang","Graham W. Taylor","Paul Fieguth"],"pdf_url":"https://arxiv.org/pdf/2307.10455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10440v1","updated":"2023-07-19T20:11:30Z","published":"2023-07-19T20:11:30Z","title":"Confidence Estimation Using Unlabeled Data","summary":" Overconfidence is a common issue for deep neural networks, limiting their\ndeployment in real-world applications. To better estimate confidence, existing\nmethods mostly focus on fully-supervised scenarios and rely on training labels.\nIn this paper, we propose the first confidence estimation method for a\nsemi-supervised setting, when most training labels are unavailable. 
We\nstipulate that even with limited training labels, we can still reasonably\napproximate the confidence of model on unlabeled samples by inspecting the\nprediction consistency through the training process. We use training\nconsistency as a surrogate function and propose a consistency ranking loss for\nconfidence estimation. On both image classification and segmentation tasks, our\nmethod achieves state-of-the-art performances in confidence estimation.\nFurthermore, we show the benefit of the proposed method through a downstream\nactive learning task. The code is available at\nhttps://github.com/TopoXLab/consistency-ranking-loss\n","authors":["Chen Li","Xiaoling Hu","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.10440v1.pdf","comment":"Accepted by ICLR'23"},{"id":"http://arxiv.org/abs/2105.11166v6","updated":"2023-07-19T19:32:53Z","published":"2021-05-24T09:16:04Z","title":"AirNet: Neural Network Transmission over the Air","summary":" State-of-the-art performance for many edge applications is achieved by deep\nneural networks (DNNs). Often, these DNNs are location- and time-sensitive, and\nmust be delivered over a wireless channel rapidly and efficiently. In this\npaper, we introduce AirNet, a family of novel training and transmission methods\nthat allow DNNs to be efficiently delivered over wireless channels under\nstringent transmit power and latency constraints. This corresponds to a new\nclass of joint source-channel coding problems, aimed at delivering DNNs with\nthe goal of maximizing their accuracy at the receiver, rather than recovering\nthem with high fidelity. In AirNet, we propose the direct mapping of the DNN\nparameters to transmitted channel symbols, while the network is trained to meet\nthe channel constraints, and exhibit robustness against channel noise. AirNet\nachieves higher accuracy compared to separation-based alternatives. We further\nimprove the performance of AirNet by pruning the network below the available\nbandwidth, and expanding it for improved robustness. We also benefit from\nunequal error protection by selectively expanding important layers of the\nnetwork. Finally, we develop an approach, which simultaneously trains a\nspectrum of DNNs, each targeting a different channel condition, resolving the\nimpractical memory requirements of training distinct networks for different\nchannel conditions.\n","authors":["Mikolaj Jankowski","Deniz Gunduz","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2105.11166v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09826v2","updated":"2023-07-19T19:30:52Z","published":"2023-04-16T11:22:59Z","title":"Fairness in AI and Its Long-Term Implications on Society","summary":" Successful deployment of artificial intelligence (AI) in various settings has\nled to numerous positive outcomes for individuals and society. However, AI\nsystems have also been shown to harm parts of the population due to biased\npredictions. AI fairness focuses on mitigating such biases to ensure AI\ndecision making is not discriminatory towards certain groups. We take a closer\nlook at AI fairness and analyze how lack of AI fairness can lead to deepening\nof biases over time and act as a social stressor. More specifically, we discuss\nhow biased models can lead to more negative real-world outcomes for certain\ngroups, which may then become more prevalent by deploying new AI models trained\non increasingly biased data, resulting in a feedback loop. 
If the issues\npersist, they could be reinforced by interactions with other risks and have\nsevere implications on society in the form of social unrest. We examine current\nstrategies for improving AI fairness, assess their limitations in terms of\nreal-world deployment, and explore potential paths forward to ensure we reap\nAI's benefits without causing society's collapse.\n","authors":["Ondrej Bohdal","Timothy Hospedales","Philip H. S. Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2304.09826v2.pdf","comment":"Stanford Existential Risks Conference 2023"},{"id":"http://arxiv.org/abs/2307.10422v1","updated":"2023-07-19T19:19:13Z","published":"2023-07-19T19:19:13Z","title":"PreDiff: Precipitation Nowcasting with Latent Diffusion Models","summary":" Earth system forecasting has traditionally relied on complex physical models\nthat are computationally expensive and require significant domain expertise. In\nthe past decade, the unprecedented increase in spatiotemporal Earth observation\ndata has enabled data-driven forecasting models using deep learning techniques.\nThese models have shown promise for diverse Earth system forecasting tasks but\neither struggle with handling uncertainty or neglect domain-specific prior\nknowledge, resulting in averaging possible futures to blurred forecasts or\ngenerating physically implausible predictions. To address these limitations, we\npropose a two-stage pipeline for probabilistic spatiotemporal forecasting: 1)\nWe develop PreDiff, a conditional latent diffusion model capable of\nprobabilistic forecasts. 2) We incorporate an explicit knowledge control\nmechanism to align forecasts with domain-specific physical constraints. This is\nachieved by estimating the deviation from imposed constraints at each denoising\nstep and adjusting the transition distribution accordingly. We conduct\nempirical studies on two datasets: N-body MNIST, a synthetic dataset with\nchaotic behavior, and SEVIR, a real-world precipitation nowcasting dataset.\nSpecifically, we impose the law of conservation of energy in N-body MNIST and\nanticipated precipitation intensity in SEVIR. Experiments demonstrate the\neffectiveness of PreDiff in handling uncertainty, incorporating domain-specific\nprior knowledge, and generating forecasts that exhibit high operational\nutility.\n","authors":["Zhihan Gao","Xingjian Shi","Boran Han","Hao Wang","Xiaoyong Jin","Danielle Maddix","Yi Zhu","Mu Li","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10422v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2307.10408v1","updated":"2023-07-19T18:37:57Z","published":"2023-07-19T18:37:57Z","title":"Explaining Autonomous Driving Actions with Visual Question Answering","summary":" The end-to-end learning ability of self-driving vehicles has achieved\nsignificant milestones over the last decade owing to rapid advances in deep\nlearning and computer vision algorithms. However, as autonomous driving\ntechnology is a safety-critical application of artificial intelligence (AI),\nroad accidents and established regulatory principles necessitate the need for\nthe explainability of intelligent action choices for self-driving vehicles. To\nfacilitate interpretability of decision-making in autonomous driving, we\npresent a Visual Question Answering (VQA) framework, which explains driving\nactions with question-answering-based causal reasoning. 
To do so, we first\ncollect driving videos in a simulation environment using reinforcement learning\n(RL) and extract consecutive frames from this log data uniformly for five\nselected action categories. Further, we manually annotate the extracted frames\nusing question-answer pairs as justifications for the actions chosen in each\nscenario. Finally, we evaluate the correctness of the VQA-predicted answers for\nactions on unseen driving scenes. The empirical results suggest that the VQA\nmechanism can provide support to interpret real-time decisions of autonomous\nvehicles and help enhance overall driving safety.\n","authors":["Shahin Atakishiyev","Mohammad Salameh","Housam Babiker","Randy Goebel"],"pdf_url":"https://arxiv.org/pdf/2307.10408v1.pdf","comment":"Accepted to the 2023 IEEE International Conference on Intelligent\n Transportation Systems (IEEE ITSC-2023)"},{"id":"http://arxiv.org/abs/2307.10404v1","updated":"2023-07-19T18:19:18Z","published":"2023-07-19T18:19:18Z","title":"Interpreting and Correcting Medical Image Classification with PIP-Net","summary":" Part-prototype models are explainable-by-design image classifiers, and a\npromising alternative to black box AI. This paper explores the applicability\nand potential of interpretable machine learning, in particular PIP-Net, for\nautomated diagnosis support on real-world medical imaging data. PIP-Net learns\nhuman-understandable prototypical image parts and we evaluate its accuracy and\ninterpretability for fracture detection and skin cancer diagnosis. We find that\nPIP-Net's decision making process is in line with medical classification\nstandards, while only provided with image-level class labels. Because of\nPIP-Net's unsupervised pretraining of prototypes, data quality problems such as\nundesired text in an X-ray or labelling errors can be easily identified.\nAdditionally, we are the first to show that humans can manually correct the\nreasoning of PIP-Net by directly disabling undesired prototypes. We conclude\nthat part-prototype models are promising for medical applications due to their\ninterpretability and potential for advanced model debugging.\n","authors":["Meike Nauta","Johannes H. Hegeman","Jeroen Geerdink","Jörg Schlötterer","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2307.10404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10387v1","updated":"2023-07-19T18:00:32Z","published":"2023-07-19T18:00:32Z","title":"POV-Surgery: A Dataset for Egocentric Hand and Tool Pose Estimation\n During Surgical Activities","summary":" The surgical usage of Mixed Reality (MR) has received growing attention in\nareas such as surgical navigation systems, skill assessment, and robot-assisted\nsurgeries. For such applications, pose estimation for hand and surgical\ninstruments from an egocentric perspective is a fundamental task and has been\nstudied extensively in the computer vision field in recent years. However, the\ndevelopment of this field has been impeded by a lack of datasets, especially in\nthe surgical field, where bloody gloves and reflective metallic tools make it\nhard to obtain 3D pose annotations for hands and objects using conventional\nmethods. To address this issue, we propose POV-Surgery, a large-scale,\nsynthetic, egocentric dataset focusing on pose estimation for hands with\ndifferent surgical gloves and three orthopedic surgical instruments, namely\nscalpel, friem, and diskplacer. 
Our dataset consists of 53 sequences and 88,329\nframes, featuring high-resolution RGB-D video streams with activity\nannotations, accurate 3D and 2D annotations for hand-object pose, and 2D\nhand-object segmentation masks. We fine-tune the current SOTA methods on\nPOV-Surgery and further show the generalizability when applying to real-life\ncases with surgical gloves and tools by extensive evaluations. The code and the\ndataset are publicly available at batfacewayne.github.io/POV_Surgery_io/.\n","authors":["Rui Wang","Sophokles Ktistakis","Siwei Zhang","Mirko Meboldt","Quentin Lohmeyer"],"pdf_url":"https://arxiv.org/pdf/2307.10387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10373v1","updated":"2023-07-19T18:00:03Z","published":"2023-07-19T18:00:03Z","title":"TokenFlow: Consistent Diffusion Features for Consistent Video Editing","summary":" The generative AI revolution has recently expanded to videos. Nevertheless,\ncurrent state-of-the-art video models are still lagging behind image models in\nterms of visual quality and user control over the generated content. In this\nwork, we present a framework that harnesses the power of a text-to-image\ndiffusion model for the task of text-driven video editing. Specifically, given\na source video and a target text-prompt, our method generates a high-quality\nvideo that adheres to the target text, while preserving the spatial layout and\nmotion of the input video. Our method is based on a key observation that\nconsistency in the edited video can be obtained by enforcing consistency in the\ndiffusion feature space. We achieve this by explicitly propagating diffusion\nfeatures based on inter-frame correspondences, readily available in the model.\nThus, our framework does not require any training or fine-tuning, and can work\nin conjunction with any off-the-shelf text-to-image editing method. We\ndemonstrate state-of-the-art editing results on a variety of real-world videos.\nWebpage: https://diffusion-tokenflow.github.io/\n","authors":["Michal Geyer","Omer Bar-Tal","Shai Bagon","Tali Dekel"],"pdf_url":"https://arxiv.org/pdf/2307.10373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10350v1","updated":"2023-07-19T17:47:12Z","published":"2023-07-19T17:47:12Z","title":"Improving Multimodal Datasets with Image Captioning","summary":" Massive web datasets play a key role in the success of large vision-language\nmodels like CLIP and Flamingo. However, the raw web data is noisy, and existing\nfiltering methods to reduce noise often come at the expense of data diversity.\nOur work focuses on caption quality as one major source of noise, and studies\nhow generated captions can increase the utility of web-scraped datapoints with\nnondescript text. Through exploring different mixing strategies for raw and\ngenerated captions, we outperform the best filtering method proposed by the\nDataComp benchmark by 2% on ImageNet and 4% on average across 38 tasks, given a\ncandidate pool of 128M image-text pairs. Our best approach is also 2x better at\nFlickr and MS-COCO retrieval. We then analyze what makes synthetic captions an\neffective source of text supervision. 
In experimenting with different image\ncaptioning models, we also demonstrate that the performance of a model on\nstandard image captioning benchmarks (e.g., NoCaps CIDEr) is not a reliable\nindicator of the utility of the captions it generates for multimodal training.\nFinally, our experiments with using generated captions at DataComp's large\nscale (1.28B image-text pairs) offer insights into the limitations of synthetic\ntext, as well as the importance of image curation with increasing training data\nquantity.\n","authors":["Thao Nguyen","Samir Yitzhak Gadre","Gabriel Ilharco","Sewoong Oh","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2307.10350v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.09989v1","updated":"2023-07-19T13:49:35Z","published":"2023-07-19T13:49:35Z","title":"UniMatch: A Unified User-Item Matching Framework for the Multi-purpose\n Merchant Marketing","summary":" When doing private domain marketing with cloud services, the merchants\nusually have to purchase different machine learning models for the multiple\nmarketing purposes, leading to a very high cost. We present a unified user-item\nmatching framework to simultaneously conduct item recommendation and user\ntargeting with just one model. We empirically demonstrate that the above\nconcurrent modeling is viable via modeling the user-item interaction matrix\nwith the multinomial distribution, and propose a bidirectional bias-corrected\nNCE loss for the implementation. The proposed loss function guides the model to\nlearn the user-item joint probability $p(u,i)$ instead of the conditional\nprobability $p(i|u)$ or $p(u|i)$ through correcting both the users and items'\nbiases caused by the in-batch negative sampling. In addition, our framework is\nmodel-agnostic enabling a flexible adaptation of different model architectures.\nExtensive experiments demonstrate that our framework results in significant\nperformance gains in comparison with the state-of-the-art methods, with greatly\nreduced cost on computing resources and daily maintenance.\n","authors":["Qifang Zhao","Tianyu Li","Meng Du","Yu Jiang","Qinghui Sun","Zhongyao Wang","Hong Liu","Huan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09985v1","updated":"2023-07-19T13:44:32Z","published":"2023-07-19T13:44:32Z","title":"Our Model Achieves Excellent Performance on MovieLens: What Does it\n Mean?","summary":" A typical benchmark dataset for recommender system (RecSys) evaluation\nconsists of user-item interactions generated on a platform within a time\nperiod. The interaction generation mechanism partially explains why a user\ninteracts with (e.g.,like, purchase, rate) an item, and the context of when a\nparticular interaction happened. In this study, we conduct a meticulous\nanalysis on the MovieLens dataset and explain the potential impact on using the\ndataset for evaluating recommendation algorithms. We make a few main findings\nfrom our analysis. First, there are significant differences in user\ninteractions at the different stages when a user interacts with the MovieLens\nplatform. The early interactions largely define the user portrait which affect\nthe subsequent interactions. Second, user interactions are highly affected by\nthe candidate movies that are recommended by the platform's internal\nrecommendation algorithm(s). 
Removal of interactions that happen nearer to the\nlast few interactions of a user leads to increasing difficulty in learning user\npreference, thus deteriorating recommendation accuracy. Third, changing the\norder of user interactions makes it more difficult for sequential algorithms to\ncapture the progressive interaction process. Based on these findings, we\nfurther discuss the discrepancy between the interaction generation mechanism\nthat is employed by the MovieLens system and that of typical real world\nrecommendation scenarios. In summary, models that achieve excellent\nrecommendation accuracy on the MovieLens dataset may not demonstrate superior\nperformance in practice for at least two kinds of differences: (i) the\ndifferences in the contexts of user-item interaction generation, and (ii) the\ndifferences in user knowledge about the item collections.\n","authors":["Yu-chen Fan","Yitong Ji","Jie Zhang","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2307.09985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09834v1","updated":"2023-07-19T08:44:11Z","published":"2023-07-19T08:44:11Z","title":"Who Provides the Largest Megaphone? The Role of Google News in Promoting\n Russian State-Affiliated News Sources","summary":" The Internet has not only digitized but also democratized information access\nacross the globe. This gradual but path-breaking move to online information\npropagation has resulted in search engines playing an increasingly prominent\nrole in shaping access to human knowledge. When an Internet user enters a\nquery, the search engine sorts through the hundreds of billions of possible\nwebpages to determine what to show. Google dominates the search engine market,\nwith Google Search surpassing 80% market share globally every year of the last\ndecade. Only in Russia and China do Google competitors claim more market share,\nwith approximately 60% of Internet users in Russia preferring Yandex (compared\nto 40% in favor of Google) and more than 80% of China's Internet users\naccessing Baidu as of 2022. Notwithstanding this long-standing regional\nvariation in Internet search providers, there is limited research showing how\nthese providers compare in terms of propagating state-sponsored information.\nOur study fills this research gap by focusing on Russian cyberspace and\nexamining how Google and Yandex's search algorithms rank content from Russian\nstate-controlled media (hereon, RSM) outlets. This question is timely and of\npractical interest given widespread reports indicating that RSM outlets have\nactively engaged in promoting Kremlin propaganda in the lead-up to, and in the\naftermath of, the Russian invasion of Ukraine in February 2022.\n","authors":["Keeley Erhardt","Saurabh Khanna"],"pdf_url":"https://arxiv.org/pdf/2307.09834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09775v1","updated":"2023-07-19T06:31:58Z","published":"2023-07-19T06:31:58Z","title":"DisCover: Disentangled Music Representation Learning for Cover Song\n Identification","summary":" In the field of music information retrieval (MIR), cover song identification\n(CSI) is a challenging task that aims to identify cover versions of a query\nsong from a massive collection. Existing works still suffer from high\nintra-song variances and inter-song correlations, due to the entangled nature\nof version-specific and version-invariant factors in their modeling. 
In this\nwork, we set the goal of disentangling version-specific and version-invariant\nfactors, which could make it easier for the model to learn invariant music\nrepresentations for unseen query songs. We analyze the CSI task in a\ndisentanglement view with the causal graph technique, and identify the\nintra-version and inter-version effects biasing the invariant learning. To\nblock these effects, we propose the disentangled music representation learning\nframework (DisCover) for CSI. DisCover consists of two critical components: (1)\nKnowledge-guided Disentanglement Module (KDM) and (2) Gradient-based\nAdversarial Disentanglement Module (GADM), which block intra-version and\ninter-version biased effects, respectively. KDM minimizes the mutual\ninformation between the learned representations and version-variant factors\nthat are identified with prior domain knowledge. GADM identifies\nversion-variant factors by simulating the representation transitions between\nintra-song versions, and exploits adversarial distillation for effect blocking.\nExtensive comparisons with best-performing methods and in-depth analysis\ndemonstrate the effectiveness of DisCover and the and necessity of\ndisentanglement for CSI.\n","authors":["Jiahao Xun","Shengyu Zhang","Yanting Yang","Jieming Zhu","Liqun Deng","Zhou Zhao","Zhenhua Dong","Ruiqi Li","Lichao Zhang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09751v1","updated":"2023-07-19T05:23:43Z","published":"2023-07-19T05:23:43Z","title":"Information Retrieval Meets Large Language Models: A Strategic Report\n from Chinese IR Community","summary":" The research field of Information Retrieval (IR) has evolved significantly,\nexpanding beyond traditional search to meet diverse user information needs.\nRecently, Large Language Models (LLMs) have demonstrated exceptional\ncapabilities in text understanding, generation, and knowledge inference,\nopening up exciting avenues for IR research. LLMs not only facilitate\ngenerative retrieval but also offer improved solutions for user understanding,\nmodel evaluation, and user-system interactions. More importantly, the\nsynergistic relationship among IR models, LLMs, and humans forms a new\ntechnical paradigm that is more powerful for information seeking. IR models\nprovide real-time and relevant information, LLMs contribute internal knowledge,\nand humans play a central role of demanders and evaluators to the reliability\nof information services. Nevertheless, significant challenges exist, including\ncomputational costs, credibility concerns, domain-specific limitations, and\nethical considerations. To thoroughly discuss the transformative impact of LLMs\non IR research, the Chinese IR community conducted a strategic workshop in\nApril 2023, yielding valuable insights. 
This paper provides a summary of the\nworkshop's outcomes, including the rethinking of IR's core values, the mutual\nenhancement of LLMs and IR, the proposal of a novel IR technical paradigm, and\nopen challenges.\n","authors":["Qingyao Ai","Ting Bai","Zhao Cao","Yi Chang","Jiawei Chen","Zhumin Chen","Zhiyong Cheng","Shoubin Dong","Zhicheng Dou","Fuli Feng","Shen Gao","Jiafeng Guo","Xiangnan He","Yanyan Lan","Chenliang Li","Yiqun Liu","Ziyu Lyu","Weizhi Ma","Jun Ma","Zhaochun Ren","Pengjie Ren","Zhiqiang Wang","Mingwen Wang","Jirong Wen","Le Wu","Xin Xin","Jun Xu","Dawei Yin","Peng Zhang","Fan Zhang","Weinan Zhang","Min Zhang","Xiaofei Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.09751v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2208.06265v2","updated":"2023-07-19T05:08:06Z","published":"2022-08-10T08:28:46Z","title":"Trustworthy Recommender Systems","summary":" Recommender systems (RSs) aim to help users to effectively retrieve items of\ntheir interests from a large catalogue. For a quite long period of time,\nresearchers and practitioners have been focusing on developing accurate RSs.\nRecent years have witnessed an increasing number of threats to RSs, coming from\nattacks, system and user generated noise, system bias. As a result, it has\nbecome clear that a strict focus on RS accuracy is limited and the research\nmust consider other important factors, e.g., trustworthiness. For end users, a\ntrustworthy RS (TRS) should not only be accurate, but also transparent,\nunbiased and fair as well as robust to noise or attacks. These observations\nactually led to a paradigm shift of the research on RSs: from accuracy-oriented\nRSs to TRSs. However, researchers lack a systematic overview and discussion of\nthe literature in this novel and fast developing field of TRSs. To this end, in\nthis paper, we provide an overview of TRSs, including a discussion of the\nmotivation and basic concepts of TRSs, a presentation of the challenges in\nbuilding TRSs, and a perspective on the future directions in this area. We also\nprovide a novel conceptual framework to support the construction of TRSs.\n","authors":["Shoujin Wang","Xiuzhen Zhang","Yan Wang","Huan Liu","Francesco Ricci"],"pdf_url":"https://arxiv.org/pdf/2208.06265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09688v1","updated":"2023-07-19T00:08:49Z","published":"2023-07-19T00:08:49Z","title":"Amazon-M2: A Multilingual Multi-locale Shopping Session Dataset for\n Recommendation and Text Generation","summary":" Modeling customer shopping intentions is a crucial task for e-commerce, as it\ndirectly impacts user experience and engagement. Thus, accurately understanding\ncustomer preferences is essential for providing personalized recommendations.\nSession-based recommendation, which utilizes customer session data to predict\ntheir next interaction, has become increasingly popular. However, existing\nsession datasets have limitations in terms of item attributes, user diversity,\nand dataset scale. As a result, they cannot comprehensively capture the\nspectrum of user behaviors and preferences. To bridge this gap, we present the\nAmazon Multilingual Multi-locale Shopping Session Dataset, namely Amazon-M2. It\nis the first multilingual dataset consisting of millions of user sessions from\nsix different locales, where the major languages of products are English,\nGerman, Japanese, French, Italian, and Spanish. 
Remarkably, the dataset can\nhelp us enhance personalization and understanding of user preferences, which\ncan benefit various existing tasks as well as enable new tasks. To test the\npotential of the dataset, we introduce three tasks in this work: (1)\nnext-product recommendation, (2) next-product recommendation with domain\nshifts, and (3) next-product title generation. With the above tasks, we\nbenchmark a range of algorithms on our proposed dataset, drawing new insights\nfor further research and practice. In addition, based on the proposed dataset\nand tasks, we hosted a competition in the KDD CUP 2023 and have attracted\nthousands of users and submissions. The winning solutions and the associated\nworkshop can be accessed at our website https://kddcup23.github.io/.\n","authors":["Wei Jin","Haitao Mao","Zheng Li","Haoming Jiang","Chen Luo","Hongzhi Wen","Haoyu Han","Hanqing Lu","Zhengyang Wang","Ruirui Li","Zhen Li","Monica Xiao Cheng","Rahul Goutam","Haiyang Zhang","Karthik Subbian","Suhang Wang","Yizhou Sun","Jiliang Tang","Bing Yin","Xianfeng Tang"],"pdf_url":"https://arxiv.org/pdf/2307.09688v1.pdf","comment":"Dataset for KDD Cup 2023, https://kddcup23.github.io/"},{"id":"http://arxiv.org/abs/2205.11498v2","updated":"2023-07-19T23:05:57Z","published":"2022-05-23T17:53:44Z","title":"Injecting Domain Adaptation with Learning-to-hash for Effective and\n Efficient Zero-shot Dense Retrieval","summary":" Dense retrieval overcome the lexical gap and has shown great success in\nad-hoc information retrieval (IR). Despite their success, dense retrievers are\nexpensive to serve across practical use cases. For use cases requiring to\nsearch from millions of documents, the dense index becomes bulky and requires\nhigh memory usage for storing the index. More recently, learning-to-hash (LTH)\ntechniques, for e.g., BPR and JPQ, produce binary document vectors, thereby\nreducing the memory requirement to efficiently store the dense index. LTH\ntechniques are supervised and finetune the retriever using a ranking loss. They\noutperform their counterparts, i.e., traditional out-of-the-box vector\ncompression techniques such as PCA or PQ. A missing piece from prior work is\nthat existing techniques have been evaluated only in-domain, i.e., on a single\ndataset such as MS MARCO. In our work, we evaluate LTH and vector compression\ntechniques for improving the downstream zero-shot retrieval accuracy of the\nTAS-B dense retriever while maintaining efficiency at inference. Our results\ndemonstrate that, unlike prior work, LTH strategies when applied naively can\nunderperform the zero-shot TAS-B dense retriever on average by up to 14%\nnDCG@10 on the BEIR benchmark. To solve this limitation, in our work, we\npropose an easy yet effective solution of injecting domain adaptation with\nexisting supervised LTH techniques. We experiment with two well-known\nunsupervised domain adaptation techniques: GenQ and GPL. 
Our domain adaptation\ninjection technique can improve the downstream zero-shot retrieval\neffectiveness for both BPR and JPQ variants of the TAS-B model by on average\n11.5% and 8.2% nDCG@10 while both maintaining 32$\\times$ memory efficiency and\n14$\\times$ and 2$\\times$ speedup respectively in CPU retrieval latency on BEIR.\nAll our code, models, and data are publicly available at\nhttps://github.com/thakur-nandan/income.\n","authors":["Nandan Thakur","Nils Reimers","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2205.11498v2.pdf","comment":"Accepted at ReNeuIR 2023 Workshop"},{"id":"http://arxiv.org/abs/2307.10488v1","updated":"2023-07-19T22:48:02Z","published":"2023-07-19T22:48:02Z","title":"SPRINT: A Unified Toolkit for Evaluating and Demystifying Zero-shot\n Neural Sparse Retrieval","summary":" Traditionally, sparse retrieval systems relied on lexical representations to\nretrieve documents, such as BM25, dominated information retrieval tasks. With\nthe onset of pre-trained transformer models such as BERT, neural sparse\nretrieval has led to a new paradigm within retrieval. Despite the success,\nthere has been limited software supporting different sparse retrievers running\nin a unified, common environment. This hinders practitioners from fairly\ncomparing different sparse models and obtaining realistic evaluation results.\nAnother missing piece is, that a majority of prior work evaluates sparse\nretrieval models on in-domain retrieval, i.e. on a single dataset: MS MARCO.\nHowever, a key requirement in practical retrieval systems requires models that\ncan generalize well to unseen out-of-domain, i.e. zero-shot retrieval tasks. In\nthis work, we provide SPRINT, a unified Python toolkit based on Pyserini and\nLucene, supporting a common interface for evaluating neural sparse retrieval.\nThe toolkit currently includes five built-in models: uniCOIL, DeepImpact,\nSPARTA, TILDEv2 and SPLADEv2. Users can also easily add customized models by\ndefining their term weighting method. Using our toolkit, we establish strong\nand reproducible zero-shot sparse retrieval baselines across the\nwell-acknowledged benchmark, BEIR. Our results demonstrate that SPLADEv2\nachieves the best average score of 0.470 nDCG@10 on BEIR amongst all neural\nsparse retrievers. In this work, we further uncover the reasons behind its\nperformance gain. We show that SPLADEv2 produces sparse representations with a\nmajority of tokens outside of the original query and document which is often\ncrucial for its performance gains, i.e. a limitation among its other sparse\ncounterparts. We provide our SPRINT toolkit, models, and data used in our\nexperiments publicly here at https://github.com/thakur-nandan/sprint.\n","authors":["Nandan Thakur","Kexin Wang","Iryna Gurevych","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2307.10488v1.pdf","comment":"Accepted at SIGIR 2023 (Resource Track)"},{"id":"http://arxiv.org/abs/2307.10479v1","updated":"2023-07-19T22:20:06Z","published":"2023-07-19T22:20:06Z","title":"Fast Approximate Nearest Neighbor Search with a Dynamic Exploration\n Graph using Continuous Refinement","summary":" For approximate nearest neighbor search, graph-based algorithms have shown to\noffer the best trade-off between accuracy and search time. 
We propose the\nDynamic Exploration Graph (DEG) which significantly outperforms existing\nalgorithms in terms of search and exploration efficiency by combining two new\nideas: First, a single undirected even regular graph is incrementally built by\npartially replacing existing edges to integrate new vertices and to update old\nneighborhoods at the same time. Secondly, an edge optimization algorithm is\nused to continuously improve the quality of the graph. Combining this ongoing\nrefinement with the graph construction process leads to a well-organized graph\nstructure at all times, resulting in: (1) increased search efficiency, (2)\npredictable index size, (3) guaranteed connectivity and therefore reachability\nof all vertices, and (4) a dynamic graph structure. In addition we investigate\nhow well existing graph-based search systems can handle indexed queries where\nthe seed vertex of a search is the query itself. Such exploration tasks,\ndespite their good starting point, are not necessarily easy. High efficiency in\napproximate nearest neighbor search (ANNS) does not automatically imply good\nperformance in exploratory search. Extensive experiments show that our new\nDynamic Exploration Graph outperforms existing algorithms significantly for\nindexed and unindexed queries.\n","authors":["Nico Hezel","Kai Uwe Barthel","Konstantin Schall","Klaus Jung"],"pdf_url":"https://arxiv.org/pdf/2307.10479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10471v1","updated":"2023-07-19T21:45:07Z","published":"2023-07-19T21:45:07Z","title":"Classification of Visualization Types and Perspectives in Patents","summary":" Due to the swift growth of patent applications each year, information and\nmultimedia retrieval approaches that facilitate patent exploration and\nretrieval are of utmost importance. Different types of visualizations (e.g.,\ngraphs, technical drawings) and perspectives (e.g., side view, perspective) are\nused to visualize details of innovations in patents. The classification of\nthese images enables a more efficient search and allows for further analysis.\nSo far, datasets for image type classification miss some important\nvisualization types for patents. Furthermore, related work does not make use of\nrecent deep learning approaches including transformers. In this paper, we adopt\nstate-of-the-art deep learning methods for the classification of visualization\ntypes and perspectives in patent images. We extend the CLEF-IP dataset for\nimage type classification in patents to ten classes and provide manual ground\ntruth annotations. In addition, we derive a set of hierarchical classes from a\ndataset that provides weakly-labeled data for image perspectives. Experimental\nresults have demonstrated the feasibility of the proposed approaches. Source\ncode, models, and dataset will be made publicly available.\n","authors":["Junaid Ahmed Ghauri","Eric Müller-Budack","Ralph Ewerth"],"pdf_url":"https://arxiv.org/pdf/2307.10471v1.pdf","comment":"Accepted in International Conference on Theory and Practice of\n Digital Libraries (TPDL) 2023 (They have the copyright to publish\n camera-ready version of this work)"},{"id":"http://arxiv.org/abs/2109.12509v3","updated":"2023-07-19T21:28:52Z","published":"2021-09-26T06:54:26Z","title":"Deep Exploration for Recommendation Systems","summary":" Modern recommendation systems ought to benefit by probing for and learning\nfrom delayed feedback. Research has tended to focus on learning from a user's\nresponse to a single recommendation. 
Such work, which leverages methods of\nsupervised and bandit learning, forgoes learning from the user's subsequent\nbehavior. Where past work has aimed to learn from subsequent behavior, there\nhas been a lack of effective methods for probing to elicit informative delayed\nfeedback. Effective exploration through probing for delayed feedback becomes\nparticularly challenging when rewards are sparse. To address this, we develop\ndeep exploration methods for recommendation systems. In particular, we\nformulate recommendation as a sequential decision problem and demonstrate\nbenefits of deep exploration over single-step exploration. Our experiments are\ncarried out with high-fidelity industrial-grade simulators and establish large\nimprovements over existing algorithms.\n","authors":["Zheqing Zhu","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2109.12509v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10323v1","updated":"2023-07-19T07:20:30Z","published":"2023-07-19T07:20:30Z","title":"IncDSI: Incrementally Updatable Document Retrieval","summary":" Differentiable Search Index is a recently proposed paradigm for document\nretrieval, that encodes information about a corpus of documents within the\nparameters of a neural network and directly maps queries to corresponding\ndocuments. These models have achieved state-of-the-art performances for\ndocument retrieval across many benchmarks. These kinds of models have a\nsignificant limitation: it is not easy to add new documents after a model is\ntrained. We propose IncDSI, a method to add documents in real time (about\n20-50ms per document), without retraining the model on the entire dataset (or\neven parts thereof). Instead we formulate the addition of documents as a\nconstrained optimization problem that makes minimal changes to the network\nparameters. Although orders of magnitude faster, our approach is competitive\nwith re-training the model on the whole dataset and enables the development of\ndocument retrieval systems that can be updated with new information in\nreal-time. Our code for IncDSI is available at\nhttps://github.com/varshakishore/IncDSI.\n","authors":["Varsha Kishore","Chao Wan","Justin Lovelace","Yoav Artzi","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2307.10323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00370v2","updated":"2023-07-19T06:55:04Z","published":"2023-07-01T15:44:53Z","title":"Improving Text Matching in E-Commerce Search with A Rationalizable,\n Intervenable and Fast Entity-Based Relevance Model","summary":" Discovering the intended items of user queries from a massive repository of\nitems is one of the main goals of an e-commerce search system. Relevance\nprediction is essential to the search system since it helps improve\nperformance. When online serving a relevance model, the model is required to\nperform fast and accurate inference. Currently, the widely used models such as\nBi-encoder and Cross-encoder have their limitations in accuracy or inference\nspeed respectively. In this work, we propose a novel model called the\nEntity-Based Relevance Model (EBRM). We identify the entities contained in an\nitem and decompose the QI (query-item) relevance problem into multiple QE\n(query-entity) relevance problems; we then aggregate their results to form the\nQI prediction using a soft logic formulation. The decomposition allows us to\nuse a Cross-encoder QE relevance module for high accuracy as well as cache QE\npredictions for fast online inference. 
Utilizing soft logic makes the\nprediction procedure interpretable and intervenable. We also show that\npretraining the QE module with auto-generated QE data from user logs can\nfurther improve the overall performance. The proposed method is evaluated on\nlabeled data from e-commerce websites. Empirical results show that it achieves\npromising improvements with computation efficiency.\n","authors":["Jiong Cai","Yong Jiang","Yue Zhang","Chengyue Jiang","Ke Yu","Jianhui Ji","Rong Xiao","Haihong Tang","Tao Wang","Zhongqiang Huang","Pengjun Xie","Fei Huang","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2307.00370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10314v1","updated":"2023-07-19T03:31:41Z","published":"2023-07-19T03:31:41Z","title":"Mood Classification of Bangla Songs Based on Lyrics","summary":" Music can evoke various emotions, and with the advancement of technology, it\nhas become more accessible to people. Bangla music, which portrays different\nhuman emotions, lacks sufficient research. The authors of this article aim to\nanalyze Bangla songs and classify their moods based on the lyrics. To achieve\nthis, this research has compiled a dataset of 4000 Bangla song lyrics, genres,\nand used Natural Language Processing and the Bert Algorithm to analyze the\ndata. Among the 4000 songs, 1513 songs are represented for the sad mood, 1362\nfor the romantic mood, 886 for happiness, and the rest 239 are classified as\nrelaxation. By embedding the lyrics of the songs, the authors have classified\nthe songs into four moods: Happy, Sad, Romantic, and Relaxed. This research is\ncrucial as it enables a multi-class classification of songs' moods, making the\nmusic more relatable to people's emotions. The article presents the automated\nresult of the four moods accurately derived from the song lyrics.\n","authors":["Maliha Mahajebin","Mohammad Rifat Ahmmad Rashid","Nafees Mansoor"],"pdf_url":"https://arxiv.org/pdf/2307.10314v1.pdf","comment":"Presented at International Conference on. Inventive Communication and\n Computational Technologies 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.10171v1","updated":"2023-07-19T17:57:27Z","published":"2023-07-19T17:57:27Z","title":"LightPath: Lightweight and Scalable Path Representation Learning","summary":" Movement paths are used widely in intelligent transportation and smart city\napplications. To serve such applications, path representation learning aims to\nprovide compact representations of paths that enable efficient and accurate\noperations when used for different downstream tasks such as path ranking and\ntravel cost estimation. In many cases, it is attractive that the path\nrepresentation learning is lightweight and scalable; in resource-limited\nenvironments and under green computing limitations, it is essential. Yet,\nexisting path representation learning studies focus on accuracy and pay at most\nsecondary attention to resource consumption and scalability.\n We propose a lightweight and scalable path representation learning framework,\ntermed LightPath, that aims to reduce resource consumption and achieve\nscalability without affecting accuracy, thus enabling broader applicability.\nMore specifically, we first propose a sparse auto-encoder that ensures that the\nframework achieves good scalability with respect to path length. Next, we\npropose a relational reasoning framework to enable faster training of more\nrobust sparse path encoders. 
We also propose global-local knowledge\ndistillation to further reduce the size and improve the performance of sparse\npath encoders. Finally, we report extensive experiments on two real-world\ndatasets to offer insight into the efficiency, scalability, and effectiveness\nof the proposed framework.\n","authors":["Sean Bin Yang","Jilin Hu","Chenjuan Guo","Bin Yang","Christian S. Jensen"],"pdf_url":"https://arxiv.org/pdf/2307.10171v1.pdf","comment":"This paper has been accepted by ACM SIGKDD-23"},{"id":"http://arxiv.org/abs/2212.07383v3","updated":"2023-07-19T17:56:01Z","published":"2022-12-14T18:08:42Z","title":"Sequential Kernelized Independence Testing","summary":" Independence testing is a classical statistical problem that has been\nextensively studied in the batch setting when one fixes the sample size before\ncollecting data. However, practitioners often prefer procedures that adapt to\nthe complexity of a problem at hand instead of setting sample size in advance.\nIdeally, such procedures should (a) stop earlier on easy tasks (and later on\nharder tasks), hence making better use of available resources, and (b)\ncontinuously monitor the data and efficiently incorporate statistical evidence\nafter collecting new data, while controlling the false alarm rate. Classical\nbatch tests are not tailored for streaming data: valid inference after data\npeeking requires correcting for multiple testing which results in low power.\nFollowing the principle of testing by betting, we design sequential kernelized\nindependence tests that overcome such shortcomings. We exemplify our broad\nframework using bets inspired by kernelized dependence measures, e.g., the\nHilbert-Schmidt independence criterion. Our test is also valid under\nnon-i.i.d., time-varying settings. We demonstrate the power of our approaches\non both simulated and real data.\n","authors":["Aleksandr Podkopaev","Patrick Blöbaum","Shiva Prasad Kasiviswanathan","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2212.07383v3.pdf","comment":"To appear at ICML 2023"},{"id":"http://arxiv.org/abs/2307.10169v1","updated":"2023-07-19T17:55:13Z","published":"2023-07-19T17:55:13Z","title":"Challenges and Applications of Large Language Models","summary":" Large Language Models (LLMs) went from non-existent to ubiquitous in the\nmachine learning discourse within a few years. Due to the fast pace of the\nfield, it is difficult to identify the remaining challenges and already\nfruitful application areas. In this paper, we aim to establish a systematic set\nof open problems and application successes so that ML researchers can\ncomprehend the field's current state more quickly and become productive.\n","authors":["Jean Kaddour","Joshua Harris","Maximilian Mozes","Herbie Bradley","Roberta Raileanu","Robert McHardy"],"pdf_url":"https://arxiv.org/pdf/2307.10169v1.pdf","comment":"72 pages. v01. Work in progress. Feedback and comments are highly\n appreciated!"},{"id":"http://arxiv.org/abs/2307.10167v1","updated":"2023-07-19T17:53:22Z","published":"2023-07-19T17:53:22Z","title":"VITS : Variational Inference Thomson Sampling for contextual bandits","summary":" In this paper, we introduce and analyze a variant of the Thompson sampling\n(TS) algorithm for contextual bandits. At each round, traditional TS requires\nsamples from the current posterior distribution, which is usually intractable.\nTo circumvent this issue, approximate inference techniques can be used and\nprovide samples with distribution close to the posteriors. 
However, current\napproximate techniques yield to either poor estimation (Laplace approximation)\nor can be computationally expensive (MCMC methods, Ensemble sampling...). In\nthis paper, we propose a new algorithm, Varational Inference Thompson sampling\nVITS, based on Gaussian Variational Inference. This scheme provides powerful\nposterior approximations which are easy to sample from, and is computationally\nefficient, making it an ideal choice for TS. In addition, we show that VITS\nachieves a sub-linear regret bound of the same order in the dimension and\nnumber of round as traditional TS for linear contextual bandit. Finally, we\ndemonstrate experimentally the effectiveness of VITS on both synthetic and real\nworld datasets.\n","authors":["Pierre Clavier","Tom Huix","Alain Durmus"],"pdf_url":"https://arxiv.org/pdf/2307.10167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10163v1","updated":"2023-07-19T17:44:54Z","published":"2023-07-19T17:44:54Z","title":"Rethinking Backdoor Attacks","summary":" In a backdoor attack, an adversary inserts maliciously constructed backdoor\nexamples into a training set to make the resulting model vulnerable to\nmanipulation. Defending against such attacks typically involves viewing these\ninserted examples as outliers in the training set and using techniques from\nrobust statistics to detect and remove them.\n In this work, we present a different approach to the backdoor attack problem.\nSpecifically, we show that without structural information about the training\ndata distribution, backdoor attacks are indistinguishable from\nnaturally-occurring features in the data--and thus impossible to \"detect\" in a\ngeneral sense. Then, guided by this observation, we revisit existing defenses\nagainst backdoor attacks and characterize the (often latent) assumptions they\nmake and on which they depend. Finally, we explore an alternative perspective\non backdoor attacks: one that assumes these attacks correspond to the strongest\nfeature in the training data. Under this assumption (which we make formal) we\ndevelop a new primitive for detecting backdoor attacks. Our primitive naturally\ngives rise to a detection algorithm that comes with theoretical guarantees and\nis effective in practice.\n","authors":["Alaa Khaddaj","Guillaume Leclerc","Aleksandar Makelov","Kristian Georgiev","Hadi Salman","Andrew Ilyas","Aleksander Madry"],"pdf_url":"https://arxiv.org/pdf/2307.10163v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.10160v1","updated":"2023-07-19T17:42:36Z","published":"2023-07-19T17:42:36Z","title":"Robust Driving Policy Learning with Guided Meta Reinforcement Learning","summary":" Although deep reinforcement learning (DRL) has shown promising results for\nautonomous navigation in interactive traffic scenarios, existing work typically\nadopts a fixed behavior policy to control social vehicles in the training\nenvironment. This may cause the learned driving policy to overfit the\nenvironment, making it difficult to interact well with vehicles with different,\nunseen behaviors. In this work, we introduce an efficient method to train\ndiverse driving policies for social vehicles as a single meta-policy. By\nrandomizing the interaction-based reward functions of social vehicles, we can\ngenerate diverse objectives and efficiently train the meta-policy through\nguiding policies that achieve specific objectives. 
We further propose a\ntraining strategy to enhance the robustness of the ego vehicle's driving policy\nusing the environment where social vehicles are controlled by the learned\nmeta-policy. Our method successfully learns an ego driving policy that\ngeneralizes well to unseen situations with out-of-distribution (OOD) social\nagents' behaviors in a challenging uncontrolled T-intersection scenario.\n","authors":["Kanghoon Lee","Jiachen Li","David Isele","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2307.10160v1.pdf","comment":"ITSC 2023"},{"id":"http://arxiv.org/abs/2307.10155v1","updated":"2023-07-19T17:35:08Z","published":"2023-07-19T17:35:08Z","title":"Curvature-based Clustering on Graphs","summary":" Unsupervised node clustering (or community detection) is a classical graph\nlearning task. In this paper, we study algorithms, which exploit the geometry\nof the graph to identify densely connected substructures, which form clusters\nor communities. Our method implements discrete Ricci curvatures and their\nassociated geometric flows, under which the edge weights of the graph evolve to\nreveal its community structure. We consider several discrete curvature notions\nand analyze the utility of the resulting algorithms. In contrast to prior\nliterature, we study not only single-membership community detection, where each\nnode belongs to exactly one community, but also mixed-membership community\ndetection, where communities may overlap. For the latter, we argue that it is\nbeneficial to perform community detection on the line graph, i.e., the graph's\ndual. We provide both theoretical and empirical evidence for the utility of our\ncurvature-based clustering algorithms. In addition, we give several results on\nthe relationship between the curvature of a graph and that of its dual, which\nenable the efficient implementation of our proposed mixed-membership community\ndetection approach and which may be of independent interest for curvature-based\nnetwork analysis.\n","authors":["Yu Tian","Zachary Lubberts","Melanie Weber"],"pdf_url":"https://arxiv.org/pdf/2307.10155v1.pdf","comment":"65 pages, 19 figures"},{"id":"http://arxiv.org/abs/2307.04228v2","updated":"2023-07-19T17:24:29Z","published":"2023-07-09T16:44:37Z","title":"Efficient Bayesian travel-time tomography with geologically-complex\n priors using sensitivity-informed polynomial chaos expansion and deep\n generative networks","summary":" Monte Carlo Markov Chain (MCMC) methods commonly confront two fundamental\nchallenges: the accurate characterization of the prior distribution and the\nefficient evaluation of the likelihood. In the context of Bayesian studies on\ntomography, principal component analysis (PCA) can in some cases facilitate the\nstraightforward definition of the prior distribution, while simultaneously\nenabling the implementation of accurate surrogate models based on polynomial\nchaos expansion (PCE) to replace computationally intensive full-physics forward\nsolvers. When faced with scenarios where PCA does not offer a direct means of\neasily defining the prior distribution alternative methods like deep generative\nmodels (e.g., variational autoencoders (VAEs)), can be employed as viable\noptions. However, accurately producing a surrogate capable of capturing the\nintricate non-linear relationship between the latent parameters of a VAE and\nthe outputs of forward modeling presents a notable challenge. 
Indeed, while PCE\nmodels provide high accuracy when the input-output relationship can be\neffectively approximated by relatively low-degree multivariate polynomials,\nthis condition is typically unmet when utilizing latent variables derived from\ndeep generative models. In this contribution, we present a strategy that\ncombines the excellent reconstruction performances of VAE in terms of prior\nrepresentation with the accuracy of PCA-PCE surrogate modeling in the context\nof Bayesian ground penetrating radar (GPR) travel-time tomography. Within the\nMCMC process, the parametrization of the VAE is leveraged for prior exploration\nand sample proposal. Concurrently, modeling is conducted using PCE, which\noperates on either globally or locally defined principal components of the VAE\nsamples under examination.\n","authors":["Giovanni Angelo Meles","Macarena Amaya","Shiran Levy","Stefano Marelli","Niklas Linde"],"pdf_url":"https://arxiv.org/pdf/2307.04228v2.pdf","comment":"25 pages, 15 figures"},{"id":"http://arxiv.org/abs/2307.10142v1","updated":"2023-07-19T17:12:28Z","published":"2023-07-19T17:12:28Z","title":"Benchmarking Potential Based Rewards for Learning Humanoid Locomotion","summary":" The main challenge in developing effective reinforcement learning (RL)\npipelines is often the design and tuning of the reward functions. A well-designed\nshaping reward can lead to significantly faster learning. Naively formulated\nrewards, however, can conflict with the desired behavior and result in\noverfitting or even erratic performance if not properly tuned. In theory, the\nbroad class of potential based reward shaping (PBRS) can help guide the\nlearning process without affecting the optimal policy. Although several studies\nhave explored the use of potential based reward shaping to accelerate learning\nconvergence, most have been limited to grid-worlds and low-dimensional systems,\nand RL in robotics has predominantly relied on standard forms of reward\nshaping. In this paper, we benchmark standard forms of shaping with PBRS for a\nhumanoid robot. We find that in this high-dimensional system, PBRS has only\nmarginal benefits in convergence speed. However, the PBRS reward terms are\nsignificantly more robust to scaling than typical reward shaping approaches,\nand thus easier to tune.\n","authors":["Se Hwan Jeon","Steve Heim","Charles Khazoom","Sangbae Kim"],"pdf_url":"https://arxiv.org/pdf/2307.10142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08169v2","updated":"2023-07-19T16:45:18Z","published":"2022-09-16T20:52:39Z","title":"Value Summation: A Novel Scoring Function for MPC-based Model-based\n Reinforcement Learning","summary":" This paper proposes a novel scoring function for the planning module of\nMPC-based reinforcement learning methods to address the inherent bias of using\nthe reward function to score trajectories. The proposed method enhances the\nlearning efficiency of existing MPC-based MBRL methods using the discounted sum\nof values. The method utilizes optimal trajectories to guide policy learning\nand updates its state-action value function based on real-world and augmented\nonboard data. The learning efficiency of the proposed method is evaluated in\nselected MuJoCo Gym environments as well as in learning locomotion skills for a\nsimulated model of the Cassie robot. 
The results demonstrate that the proposed\nmethod outperforms the current state-of-the-art algorithms in terms of learning\nefficiency and average reward return.\n","authors":["Mehran Raisi","Amirhossein Noohian","Luc Mccutcheon","Saber Fallah"],"pdf_url":"https://arxiv.org/pdf/2209.08169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09191v2","updated":"2023-07-19T16:24:31Z","published":"2023-07-17T13:17:26Z","title":"A benchmark of categorical encoders for binary classification","summary":" Categorical encoders transform categorical features into numerical\nrepresentations that are indispensable for a wide range of machine learning\nmodels. Existing encoder benchmark studies lack generalizability because of\ntheir limited choice of (1) encoders, (2) experimental factors, and (3)\ndatasets. Additionally, inconsistencies arise from the adoption of varying\naggregation strategies. This paper is the most comprehensive benchmark of\ncategorical encoders to date, including an extensive evaluation of 32\nconfigurations of encoders from diverse families, with 36 combinations of\nexperimental factors, and on 50 datasets. The study shows the profound\ninfluence of dataset selection, experimental factors, and aggregation\nstrategies on the benchmark's conclusions -- aspects disregarded in previous\nencoder benchmarks.\n","authors":["Federico Matteucci","Vadim Arzamasov","Klemens Boehm"],"pdf_url":"https://arxiv.org/pdf/2307.09191v2.pdf","comment":"Submitted to the 37th Conference on Neural Information Processing\n Systems (NeurIPS 2023) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2103.03328v3","updated":"2023-07-19T16:19:53Z","published":"2021-03-04T20:58:22Z","title":"Evaluation of Complexity Measures for Deep Learning Generalization in\n Medical Image Analysis","summary":" The generalization performance of deep learning models for medical image\nanalysis often decreases on images collected with different devices for data\nacquisition, device settings, or patient population. A better understanding of\nthe generalization capacity on new images is crucial for clinicians'\ntrustworthiness in deep learning. Although significant research efforts have\nbeen recently directed toward establishing generalization bounds and complexity\nmeasures, still, there is often a significant discrepancy between the predicted\nand actual generalization performance. As well, related large empirical studies\nhave been primarily based on validation with general-purpose image datasets.\nThis paper presents an empirical study that investigates the correlation\nbetween 25 complexity measures and the generalization abilities of supervised\ndeep learning classifiers for breast ultrasound images. The results indicate\nthat PAC-Bayes flatness-based and path norm-based measures produce the most\nconsistent explanation for the combination of models and data. 
We also\ninvestigate the use of multi-task classification and segmentation approach for\nbreast images, and report that such learning approach acts as an implicit\nregularizer and is conducive toward improved generalization.\n","authors":["Aleksandar Vakanski","Min Xian"],"pdf_url":"https://arxiv.org/pdf/2103.03328v3.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2306.13197v2","updated":"2023-07-19T16:19:24Z","published":"2023-06-22T20:42:50Z","title":"Pre or Post-Softmax Scores in Gradient-based Attribution Methods, What\n is Best?","summary":" Gradient based attribution methods for neural networks working as classifiers\nuse gradients of network scores. Here we discuss the practical differences\nbetween using gradients of pre-softmax scores versus post-softmax scores, and\ntheir respective advantages and disadvantages.\n","authors":["Miguel Lerma","Mirtha Lucas"],"pdf_url":"https://arxiv.org/pdf/2306.13197v2.pdf","comment":"8 pages, 2 figures, 2023 IEEE 13th International Conference on\n Pattern Recognition Systems (ICPRS)"},{"id":"http://arxiv.org/abs/2210.12547v2","updated":"2023-07-19T16:16:50Z","published":"2022-10-22T20:42:06Z","title":"SurCo: Learning Linear Surrogates For Combinatorial Nonlinear\n Optimization Problems","summary":" Optimization problems with nonlinear cost functions and combinatorial\nconstraints appear in many real-world applications but remain challenging to\nsolve efficiently compared to their linear counterparts. To bridge this gap, we\npropose $\\textbf{SurCo}$ that learns linear $\\underline{\\text{Sur}}$rogate\ncosts which can be used in existing $\\underline{\\text{Co}}$mbinatorial solvers\nto output good solutions to the original nonlinear combinatorial optimization\nproblem. The surrogate costs are learned end-to-end with nonlinear loss by\ndifferentiating through the linear surrogate solver, combining the flexibility\nof gradient-based methods with the structure of linear combinatorial\noptimization. We propose three $\\texttt{SurCo}$ variants:\n$\\texttt{SurCo}-\\texttt{zero}$ for individual nonlinear problems,\n$\\texttt{SurCo}-\\texttt{prior}$ for problem distributions, and\n$\\texttt{SurCo}-\\texttt{hybrid}$ to combine both distribution and\nproblem-specific information. We give theoretical intuition motivating\n$\\texttt{SurCo}$, and evaluate it empirically. Experiments show that\n$\\texttt{SurCo}$ finds better solutions faster than state-of-the-art and domain\nexpert approaches in real-world optimization problems such as embedding table\nsharding, inverse photonic design, and nonlinear route planning.\n","authors":["Aaron Ferber","Taoan Huang","Daochen Zha","Martin Schubert","Benoit Steiner","Bistra Dilkina","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2210.12547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05783v4","updated":"2023-07-19T16:14:31Z","published":"2023-02-11T21:07:30Z","title":"ConCerNet: A Contrastive Learning Based Framework for Automated\n Conservation Law Discovery and Trustworthy Dynamical System Prediction","summary":" Deep neural networks (DNN) have shown great capacity of modeling a dynamical\nsystem; nevertheless, they usually do not obey physics constraints such as\nconservation laws. This paper proposes a new learning framework named ConCerNet\nto improve the trustworthiness of the DNN based dynamics modeling to endow the\ninvariant properties. 
ConCerNet consists of two steps: (i) a contrastive\nlearning method to automatically capture the system invariants (i.e.\nconservation properties) along the trajectory observations; (ii) a neural\nprojection layer to guarantee that the learned dynamics models preserve the\nlearned invariants. We theoretically prove the functional relationship between\nthe learned latent representation and the unknown system invariant function.\nExperiments show that our method consistently outperforms the baseline neural\nnetworks in both coordinate error and conservation metrics by a large margin.\nWith neural network based parameterization and no dependence on prior\nknowledge, our method can be extended to complex and large-scale dynamics by\nleveraging an autoencoder.\n","authors":["Wang Zhang","Tsui-Wei Weng","Subhro Das","Alexandre Megretski","Luca Daniel","Lam M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2302.05783v4.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2307.10098v1","updated":"2023-07-19T16:13:13Z","published":"2023-07-19T16:13:13Z","title":"Gradient Sparsification For Masked Fine-Tuning of Transformers","summary":" Fine-tuning pretrained self-supervised language models is widely adopted for\ntransfer learning to downstream tasks. Fine-tuning can be achieved by freezing\ngradients of the pretrained network and only updating gradients of a newly\nadded classification layer, or by performing gradient updates on all\nparameters. Gradual unfreezing makes a trade-off between the two by gradually\nunfreezing gradients of whole layers during training. This has been an\neffective strategy to trade-off between storage and training speed with\ngeneralization performance. However, it is not clear whether gradually\nunfreezing layers throughout training is optimal, compared to sparse variants\nof gradual unfreezing which may improve fine-tuning performance. In this paper,\nwe propose to stochastically mask gradients to regularize pretrained language\nmodels for improving overall fine-tuned performance. We introduce GradDrop and\nvariants thereof, a class of gradient sparsification methods that mask\ngradients during the backward pass, acting as gradient noise. GradDrop is\nsparse and stochastic unlike gradual freezing. Extensive experiments on the\nmultilingual XGLUE benchmark with XLMR-Large show that GradDrop is competitive\nagainst methods that use additional translated data for intermediate\npretraining and outperforms standard fine-tuning and gradual unfreezing. A\npost-analysis shows how GradDrop improves performance with languages it was not\ntrained on, such as under-resourced languages.\n","authors":["James O' Neill","Sourav Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10098v1.pdf","comment":"Accepted to IJCNN 2023"},{"id":"http://arxiv.org/abs/2307.10093v1","updated":"2023-07-19T16:00:29Z","published":"2023-07-19T16:00:29Z","title":"Revisiting invariances and introducing priors in Gromov-Wasserstein\n distances","summary":" Gromov-Wasserstein distance has found many applications in machine learning\ndue to its ability to compare measures across metric spaces and its invariance\nto isometric transformations. However, in certain applications, this invariance\nproperty can be too flexible, thus undesirable. Moreover, the\nGromov-Wasserstein distance solely considers pairwise sample similarities in\ninput datasets, disregarding the raw feature representations. 
We propose a new\noptimal transport-based distance, called Augmented Gromov-Wasserstein, that\nallows for some control over the level of rigidity to transformations. It also\nincorporates feature alignments, enabling us to better leverage prior knowledge\non the input data for improved performance. We present theoretical insights\ninto the proposed metric. We then demonstrate its usefulness for single-cell\nmulti-omic alignment tasks and a transfer learning scenario in machine\nlearning.\n","authors":["Pinar Demetci","Quang Huy Tran","Ievgen Redko","Ritambhara Singh"],"pdf_url":"https://arxiv.org/pdf/2307.10093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04838v2","updated":"2023-07-19T15:59:03Z","published":"2023-07-10T18:15:03Z","title":"CREPE: Learnable Prompting With CLIP Improves Visual Relationship\n Prediction","summary":" In this paper, we explore the potential of Vision-Language Models (VLMs),\nspecifically CLIP, in predicting visual object relationships, which involves\ninterpreting visual features from images into language-based relations. Current\nstate-of-the-art methods use complex graphical models that utilize language\ncues and visual features to address this challenge. We hypothesize that the\nstrong language priors in CLIP embeddings can simplify these graphical models\npaving for a simpler approach. We adopt the UVTransE relation prediction\nframework, which learns the relation as a translational embedding with subject,\nobject, and union box embeddings from a scene. We systematically explore the\ndesign of CLIP-based subject, object, and union-box representations within the\nUVTransE framework and propose CREPE (CLIP Representation Enhanced Predicate\nEstimation). CREPE utilizes text-based representations for all three bounding\nboxes and introduces a novel contrastive training strategy to automatically\ninfer the text prompt for union-box. Our approach achieves state-of-the-art\nperformance in predicate estimation, mR@5 27.79, and mR@20 31.95 on the Visual\nGenome benchmark, achieving a 15.3\\% gain in performance over recent\nstate-of-the-art at mR@20. This work demonstrates CLIP's effectiveness in\nobject relation prediction and encourages further research on VLMs in this\nchallenging domain.\n","authors":["Rakshith Subramanyam","T. S. Jayram","Rushil Anirudh","Jayaraman J. Thiagarajan"],"pdf_url":"https://arxiv.org/pdf/2307.04838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10088v1","updated":"2023-07-19T15:57:24Z","published":"2023-07-19T15:57:24Z","title":"Android in the Wild: A Large-Scale Dataset for Android Device Control","summary":" There is a growing interest in device-control systems that can interpret\nhuman natural language instructions and execute them on a digital device by\ndirectly controlling its user interface. We present a dataset for\ndevice-control research, Android in the Wild (AITW), which is orders of\nmagnitude larger than current datasets. The dataset contains human\ndemonstrations of device interactions, including the screens and actions, and\ncorresponding natural language instructions. It consists of 715k episodes\nspanning 30k unique instructions, four versions of Android (v10-13),and eight\ndevice types (Pixel 2 XL to Pixel 6) with varying screen resolutions. It\ncontains multi-step tasks that require semantic understanding of language and\nvisual context. This dataset poses a new challenge: actions available through\nthe user interface must be inferred from their visual appearance. 
And, instead\nof simple UI element-based actions, the action space consists of precise\ngestures (e.g., horizontal scrolls to operate carousel widgets). We organize\nour dataset to encourage robustness analysis of device-control systems, i.e.,\nhow well a system performs in the presence of new task descriptions, new\napplications, or new platform versions. We develop two agents and report\nperformance across the dataset. The dataset is available at\nhttps://github.com/google-research/google-research/tree/master/android_in_the_wild.\n","authors":["Christopher Rawles","Alice Li","Daniel Rodriguez","Oriana Riva","Timothy Lillicrap"],"pdf_url":"https://arxiv.org/pdf/2307.10088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10078v1","updated":"2023-07-19T15:51:25Z","published":"2023-07-19T15:51:25Z","title":"A Dual Formulation for Probabilistic Principal Component Analysis","summary":" In this paper, we characterize Probabilistic Principal Component Analysis in\nHilbert spaces and demonstrate how the optimal solution admits a representation\nin dual space. This allows us to develop a generative framework for kernel\nmethods. Furthermore, we show how it englobes Kernel Principal Component\nAnalysis and illustrate its working on a toy and a real dataset.\n","authors":["Henri De Plaen","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2307.10078v1.pdf","comment":"ICML 2023 Workshop on Duality for Modern Machine Learning (DP4ML). 14\n pages (8 main + 5 appendix), 4 figures and 4 tables"},{"id":"http://arxiv.org/abs/2212.00736v2","updated":"2023-07-19T15:43:40Z","published":"2022-12-01T18:29:48Z","title":"An exponentially-growing family of universal quantum circuits","summary":" Quantum machine learning has become an area of growing interest but has\ncertain theoretical and hardware-specific limitations. Notably, the problem of\nvanishing gradients, or barren plateaus, renders the training impossible for\ncircuits with high qubit counts, imposing a limit on the number of qubits that\ndata scientists can use for solving problems. Independently, angle-embedded\nsupervised quantum neural networks were shown to produce truncated Fourier\nseries with a degree directly dependent on two factors: the depth of the\nencoding and the number of parallel qubits the encoding applied to. The degree\nof the Fourier series limits the model expressivity. This work introduces two\nnew architectures whose Fourier degrees grow exponentially: the sequential and\nparallel exponential quantum machine learning architectures. This is done by\nefficiently using the available Hilbert space when encoding, increasing the\nexpressivity of the quantum encoding. Therefore, the exponential growth allows\nstaying at the low-qubit limit to create highly expressive circuits avoiding\nbarren plateaus. 
Practically, parallel exponential architecture was shown to\noutperform the existing linear architectures by reducing their final mean\nsquare error value by up to 44.7% in a one-dimensional test problem.\nFurthermore, the feasibility of this technique was also shown on a trapped ion\nquantum processing unit.\n","authors":["Mo Kordzanganeh","Pavel Sekatski","Markus Pflitsch","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2212.00736v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.10062v1","updated":"2023-07-19T15:33:11Z","published":"2023-07-19T15:33:11Z","title":"Unsupervised Accuracy Estimation of Deep Visual Models using\n Domain-Adaptive Adversarial Perturbation without Source Samples","summary":" Deploying deep visual models can lead to performance drops due to the\ndiscrepancies between source and target distributions. Several approaches\nleverage labeled source data to estimate target domain accuracy, but accessing\nlabeled source data is often prohibitively difficult due to data\nconfidentiality or resource limitations on serving devices. Our work proposes a\nnew framework to estimate model accuracy on unlabeled target data without\naccess to source data. We investigate the feasibility of using pseudo-labels\nfor accuracy estimation and evolve this idea into adopting recent advances in\nsource-free domain adaptation algorithms. Our approach measures the\ndisagreement rate between the source hypothesis and the target pseudo-labeling\nfunction, adapted from the source hypothesis. We mitigate the impact of\nerroneous pseudo-labels that may arise due to a high ideal joint hypothesis\nrisk by employing adaptive adversarial perturbation on the input of the target\nmodel. Our proposed source-free framework effectively addresses the challenging\ndistribution shift scenarios and outperforms existing methods requiring source\ndata and labels for training.\n","authors":["JoonHo Lee","Jae Oh Woo","Hankyu Moon","Kwonho Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10062v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10060v1","updated":"2023-07-19T15:30:06Z","published":"2023-07-19T15:30:06Z","title":"Accurate deep learning sub-grid scale models for large eddy simulations","summary":" We present two families of sub-grid scale (SGS) turbulence models developed\nfor large-eddy simulation (LES) purposes. Their development required the\nformulation of physics-informed robust and efficient Deep Learning (DL)\nalgorithms which, unlike state-of-the-art analytical modeling techniques can\nproduce high-order complex non-linear relations between inputs and outputs.\nExplicit filtering of data from direct simulations of the canonical channel\nflow at two friction Reynolds numbers $Re_\\tau\\approx 395$ and 590 provided\naccurate data for training and testing. The two sets of models use different\nnetwork architectures. One of the architectures uses tensor basis neural\nnetworks (TBNN) and embeds the simplified analytical model form of the general\neffective-viscosity hypothesis, thus incorporating the Galilean, rotational and\nreflectional invariances. The other architecture is that of a relatively simple\nnetwork, that is able to incorporate the Galilean invariance only. However,\nthis simpler architecture has better feature extraction capacity owing to its\nability to establish relations between and extract information from\ncross-components of the integrity basis tensors and the SGS stresses. 
Both sets\nof models are used to predict the SGS stresses for feature datasets generated\nwith different filter widths, and at different Reynolds numbers. It is shown\nthat due to the simpler model's better feature learning capabilities, it\noutperforms the invariance embedded model in statistical performance metrics.\nIn a priori tests, both sets of models provide similar levels of dissipation\nand backscatter. Based on the test results, both sets of models should be\nusable in a posteriori actual LESs.\n","authors":["Rikhi Bose","Arunabha M. Roy"],"pdf_url":"https://arxiv.org/pdf/2307.10060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10053v1","updated":"2023-07-19T15:26:18Z","published":"2023-07-19T15:26:18Z","title":"Convergence Guarantees for Stochastic Subgradient Methods in Nonsmooth\n Nonconvex Optimization","summary":" In this paper, we investigate the convergence properties of the stochastic\ngradient descent (SGD) method and its variants, especially in training neural\nnetworks built from nonsmooth activation functions. We develop a novel\nframework that assigns different timescales to stepsizes for updating the\nmomentum terms and variables, respectively. Under mild conditions, we prove the\nglobal convergence of our proposed framework in both single-timescale and\ntwo-timescale cases. We show that our proposed framework encompasses a wide\nrange of well-known SGD-type methods, including heavy-ball SGD, SignSGD, Lion,\nnormalized SGD and clipped SGD. Furthermore, when the objective function adopts\na finite-sum formulation, we prove the convergence properties for these\nSGD-type methods based on our proposed framework. In particular, we prove that\nthese SGD-type methods find the Clarke stationary points of the objective\nfunction with randomly chosen stepsizes and initial points under mild\nassumptions. Preliminary numerical experiments demonstrate the high efficiency\nof our analyzed SGD-type methods.\n","authors":["Nachuan Xiao","Xiaoyin Hu","Kim-Chuan Toh"],"pdf_url":"https://arxiv.org/pdf/2307.10053v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2303.15592v2","updated":"2023-07-19T15:16:21Z","published":"2023-03-27T20:49:42Z","title":"Uncovering Bias in Personal Informatics","summary":" Personal informatics (PI) systems, powered by smartphones and wearables,\nenable people to lead healthier lifestyles by providing meaningful and\nactionable insights that break down barriers between users and their health\ninformation. Today, such systems are used by billions of users for monitoring\nnot only physical activity and sleep but also vital signs and women's and heart\nhealth, among others. Despite their widespread usage, the processing of\nsensitive PI data may suffer from biases, which may entail practical and\nethical implications. In this work, we present the first comprehensive\nempirical and analytical study of bias in PI systems, including biases in raw\ndata and in the entire machine learning life cycle. We use the most detailed\nframework to date for exploring the different sources of bias and find that\nbiases exist both in the data generation and the model learning and\nimplementation streams. 
According to our results, the most affected minority\ngroups are users with health issues, such as diabetes, joint issues, and\nhypertension, and female users, whose data biases are propagated or even\namplified by learning models, while intersectional biases can also be observed.\n","authors":["Sofia Yfantidou","Pavlos Sermpezis","Athena Vakali","Ricardo Baeza-Yates"],"pdf_url":"https://arxiv.org/pdf/2303.15592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10026v1","updated":"2023-07-19T15:11:04Z","published":"2023-07-19T15:11:04Z","title":"Contextual Reliability: When Different Features Matter in Different\n Contexts","summary":" Deep neural networks often fail catastrophically by relying on spurious\ncorrelations. Most prior work assumes a clear dichotomy into spurious and\nreliable features; however, this is often unrealistic. For example, most of the\ntime we do not want an autonomous car to simply copy the speed of surrounding\ncars -- we don't want our car to run a red light if a neighboring car does so.\nHowever, we cannot simply enforce invariance to next-lane speed, since it could\nprovide valuable information about an unobservable pedestrian at a crosswalk.\nThus, universally ignoring features that are sometimes (but not always)\nreliable can lead to non-robust performance. We formalize a new setting called\ncontextual reliability which accounts for the fact that the \"right\" features to\nuse may vary depending on the context. We propose and analyze a two-stage\nframework called Explicit Non-spurious feature Prediction (ENP) which first\nidentifies the relevant features to use for a given context, then trains a\nmodel to rely exclusively on these features. Our work theoretically and\nempirically demonstrates the advantages of ENP over existing methods and\nprovides new benchmarks for contextual reliability.\n","authors":["Gaurav Ghosal","Amrith Setlur","Daniel S. Brown","Anca D. Dragan","Aditi Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2307.10026v1.pdf","comment":"ICML 2023 Camera Ready Version"},{"id":"http://arxiv.org/abs/2307.10022v1","updated":"2023-07-19T15:05:55Z","published":"2023-07-19T15:05:55Z","title":"Europepolls: A Dataset of Country-Level Opinion Polling Data for the\n European Union and the UK","summary":" I propose an open dataset of country-level historical opinion polling data\nfor the European Union and the UK. The dataset aims to fill a gap in available\nopinion polling data for the European Union. Some existing datasets are\nrestricted to the past five years, limiting research opportunities. At the same\ntime, some larger proprietary datasets exist but are available only in a visual\npreprocessed time series format. Finally, while other large datasets for\nindividual countries might exist, these could be inaccessible due to language\nbarriers. The data was gathered from Wikipedia, and preprocessed using the\npandas library. Both the raw and the preprocessed data are in the .csv format.\nI hope that given the recent advances in LLMs and deep learning in general,\nthis large dataset will enable researchers to uncover complex interactions\nbetween multimodal data (news articles, economic indicators, social media) and\nvoting behavior. 
The raw data, the preprocessed data, and the preprocessing\nscripts are available on GitHub.\n","authors":["Konstantinos Pitas"],"pdf_url":"https://arxiv.org/pdf/2307.10022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15585v2","updated":"2023-07-19T15:00:06Z","published":"2023-03-27T20:28:26Z","title":"Beyond Accuracy: A Critical Review of Fairness in Machine Learning for\n Mobile and Wearable Computing","summary":" The field of mobile and wearable computing is undergoing a revolutionary\nintegration of machine learning. Devices can now diagnose diseases, predict\nheart irregularities, and unlock the full potential of human cognition.\nHowever, the underlying algorithms powering these predictions are not immune to\nbiases with respect to sensitive attributes (e.g., gender, race), leading to\ndiscriminatory outcomes. The goal of this work is to explore the extent to\nwhich the mobile and wearable computing community has adopted ways of reporting\ninformation about datasets and models to surface and, eventually, counter\nbiases. Our systematic review of papers published in the Proceedings of the ACM\nInteractive, Mobile, Wearable and Ubiquitous Technologies (IMWUT) journal from\n2018-2022 indicates that, while there has been progress made on algorithmic\nfairness, there is still ample room for growth. Our findings show that only a\nsmall portion (5%) of published papers adheres to modern fairness reporting,\nwhile the overwhelming majority thereof focuses on accuracy or error metrics.\nTo generalize these results across venues of similar scope, we analyzed recent\nproceedings of ACM MobiCom, MobiSys, and SenSys, IEEE Pervasive, and IEEE\nTransactions on Mobile Computing, and found no deviation from our\nprimary result. In light of these findings, our work provides practical\nguidelines for the design and development of mobile and wearable technologies\nthat not only strive for accuracy but also fairness.\n","authors":["Sofia Yfantidou","Marios Constantinides","Dimitris Spathis","Athena Vakali","Daniele Quercia","Fahim Kawsar"],"pdf_url":"https://arxiv.org/pdf/2303.15585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.09753v3","updated":"2023-07-19T14:55:17Z","published":"2022-02-20T07:42:00Z","title":"Finite-Time Analysis of Natural Actor-Critic for POMDPs","summary":" We consider the reinforcement learning problem for partially observed Markov\ndecision processes (POMDPs) with large or even countably infinite state spaces,\nwhere the controller has access to only noisy observations of the underlying\ncontrolled Markov chain. We consider a natural actor-critic method that employs\na finite internal memory for policy parameterization, and a multi-step temporal\ndifference learning algorithm for policy evaluation. We establish, to the best\nof our knowledge, the first non-asymptotic global convergence of actor-critic\nmethods for partially observed systems under function approximation. In\nparticular, in addition to the function approximation and statistical errors\nthat also arise in MDPs, we explicitly characterize the error due to the use of\nfinite-state controllers. This additional error is stated in terms of the total\nvariation distance between the traditional belief state in POMDPs and the\nposterior distribution of the hidden state when using a finite-state\ncontroller. Further, we show that this error can be made small in the case of\nsliding-block controllers by using larger block sizes.\n","authors":["Semih Cayci","Niao He","R. 
Srikant"],"pdf_url":"https://arxiv.org/pdf/2202.09753v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06385v2","updated":"2023-07-19T14:51:37Z","published":"2023-07-12T18:13:58Z","title":"Temporal Label-Refinement for Weakly-Supervised Audio-Visual Event\n Localization","summary":" Audio-Visual Event Localization (AVEL) is the task of temporally localizing\nand classifying \\emph{audio-visual events}, i.e., events simultaneously visible\nand audible in a video. In this paper, we solve AVEL in a weakly-supervised\nsetting, where only video-level event labels (their presence/absence, but not\ntheir locations in time) are available as supervision for training. Our idea is\nto use a base model to estimate labels on the training data at a finer temporal\nresolution than at the video level and re-train the model with these labels.\nI.e., we determine the subset of labels for each \\emph{slice} of frames in a\ntraining video by (i) replacing the frames outside the slice with those from a\nsecond video having no overlap in video-level labels, and (ii) feeding this\nsynthetic video into the base model to extract labels for just the slice in\nquestion. To handle the out-of-distribution nature of our synthetic videos, we\npropose an auxiliary objective for the base model that induces more reliable\npredictions of the localized event labels as desired. Our three-stage pipeline\noutperforms several existing AVEL methods with no architectural changes and\nimproves performance on a related weakly-supervised task as well.\n","authors":["Kalyan Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2307.06385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.07677v4","updated":"2023-07-19T14:45:15Z","published":"2021-06-14T18:01:08Z","title":"Planning to Fairly Allocate: Probabilistic Fairness in the Restless\n Bandit Setting","summary":" Restless and collapsing bandits are often used to model budget-constrained\nresource allocation in settings where arms have action-dependent transition\nprobabilities, such as the allocation of health interventions among patients.\nHowever, state-of-the-art Whittle-index-based approaches to this planning\nproblem either do not consider fairness among arms, or incentivize fairness\nwithout guaranteeing it. We thus introduce ProbFair, a probabilistically fair\npolicy that maximizes total expected reward and satisfies the budget constraint\nwhile ensuring a strictly positive lower bound on the probability of being\npulled at each timestep. We evaluate our algorithm on a real-world application,\nwhere interventions support continuous positive airway pressure (CPAP) therapy\nadherence among patients, as well as on a broader class of synthetic transition\nmatrices. We find that ProbFair preserves utility while providing fairness\nguarantees.\n","authors":["Christine Herlihy","Aviva Prins","Aravind Srinivasan","John P. Dickerson"],"pdf_url":"https://arxiv.org/pdf/2106.07677v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11103v2","updated":"2023-07-19T14:42:10Z","published":"2023-03-20T13:40:11Z","title":"Sionna RT: Differentiable Ray Tracing for Radio Propagation Modeling","summary":" Sionna is a GPU-accelerated open-source library for link-level simulations\nbased on TensorFlow. Since release v0.14 it integrates a differentiable ray\ntracer (RT) for the simulation of radio wave propagation. 
This unique feature\nallows for the computation of gradients of the channel impulse response and\nother related quantities with respect to many system and environment\nparameters, such as material properties, antenna patterns, array geometries, as\nwell as transmitter and receiver orientations and positions. In this paper, we\noutline the key components of Sionna RT and showcase example applications such\nas learning radio materials and optimizing transmitter orientations by gradient\ndescent. While classic ray tracing is a crucial tool for 6G research topics\nlike reconfigurable intelligent surfaces, integrated sensing and\ncommunications, as well as user localization, differentiable ray tracing is a\nkey enabler for many novel and exciting research directions, for example,\ndigital twins.\n","authors":["Jakob Hoydis","Fayçal Aït Aoudia","Sebastian Cammerer","Merlin Nimier-David","Nikolaus Binder","Guillermo Marcus","Alexander Keller"],"pdf_url":"https://arxiv.org/pdf/2303.11103v2.pdf","comment":"5 pages, 5 figures, update reflects new features of Sionna RT\n introduced in release v0.15"},{"id":"http://arxiv.org/abs/2208.07734v6","updated":"2023-07-19T14:39:54Z","published":"2022-08-16T13:09:25Z","title":"Data Augmentation is a Hyperparameter: Cherry-picked Self-Supervision\n for Unsupervised Anomaly Detection is Creating the Illusion of Success","summary":" Self-supervised learning (SSL) has emerged as a promising alternative to\ncreate supervisory signals to real-world problems, avoiding the extensive cost\nof manual labeling. SSL is particularly attractive for unsupervised tasks such\nas anomaly detection (AD), where labeled anomalies are rare or often\nnonexistent. A large catalog of augmentation functions has been used for\nSSL-based AD (SSAD) on image data, and recent works have reported that the type\nof augmentation has a significant impact on accuracy. Motivated by those, this\nwork sets out to put image-based SSAD under a larger lens and investigate the\nrole of data augmentation in SSAD. Through extensive experiments on 3 different\ndetector models and across 420 AD tasks, we provide comprehensive numerical and\nvisual evidences that the alignment between data augmentation and\nanomaly-generating mechanism is the key to the success of SSAD, and in the lack\nthereof, SSL may even impair accuracy. To the best of our knowledge, this is\nthe first meta-analysis on the role of data augmentation in SSAD.\n","authors":["Jaemin Yoo","Tiancheng Zhao","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2208.07734v6.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2307.10003v1","updated":"2023-07-19T14:23:26Z","published":"2023-07-19T14:23:26Z","title":"TbExplain: A Text-based Explanation Method for Scene Classification\n Models with the Statistical Prediction Correction","summary":" The field of Explainable Artificial Intelligence (XAI) aims to improve the\ninterpretability of black-box machine learning models. Building a heatmap based\non the importance value of input features is a popular method for explaining\nthe underlying functions of such models in producing their predictions.\nHeatmaps are almost understandable to humans, yet they are not without flaws.\nNon-expert users, for example, may not fully understand the logic of heatmaps\n(the logic in which relevant pixels to the model's prediction are highlighted\nwith different intensities or colors). 
Additionally, objects and regions of the\ninput image that are relevant to the model prediction are frequently not\nentirely differentiated by heatmaps. In this paper, we propose a framework\ncalled TbExplain that employs XAI techniques and a pre-trained object detector\nto present text-based explanations of scene classification models. Moreover,\nTbExplain incorporates a novel method to correct predictions and textually\nexplain them based on the statistics of objects in the input image when the\ninitial prediction is unreliable. To assess the trustworthiness and validity of\nthe text-based explanations, we conducted a qualitative experiment, and the\nfindings indicated that these explanations are sufficiently reliable.\nFurthermore, our quantitative and qualitative experiments on TbExplain with\nscene classification datasets reveal an improvement in classification accuracy\nover ResNet variants.\n","authors":["Amirhossein Aminimehr","Pouya Khani","Amirali Molaei","Amirmohammad Kazemeini","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2307.10003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.10403v2","updated":"2023-07-19T14:23:17Z","published":"2022-05-20T18:44:06Z","title":"Tackling Provably Hard Representative Selection via Graph Neural\n Networks","summary":" Representative Selection (RS) is the problem of finding a small subset of\nexemplars from a dataset that is representative of the dataset. In this paper,\nwe study RS for attributed graphs, and focus on finding representative nodes\nthat optimize the accuracy of a model trained on the selected representatives.\nTheoretically, we establish a new hardness result for RS (in the absence of a\ngraph structure) by proving that a particular, highly practical variant of it\n(RS for Learning) is hard to approximate in polynomial time within any\nreasonable factor, which implies a significant potential gap between the\noptimum solution of widely-used surrogate functions and the actual accuracy of\nthe model. We then study the setting where a (homophilous) graph structure is\navailable, or can be constructed, between the data points. We show that with an\nappropriate modeling approach, the presence of such a structure can turn a hard\nRS (for learning) problem into one that can be effectively solved. To this end,\nwe develop RS-GNN, a representation learning-based RS model based on Graph\nNeural Networks. Empirically, we demonstrate the effectiveness of RS-GNN on\nproblems with predefined graph structures as well as problems with graphs\ninduced from node feature similarities, by showing that RS-GNN achieves\nsignificant improvements over established baselines on a suite of eight\nbenchmarks.\n","authors":["Mehran Kazemi","Anton Tsitsulin","Hossein Esfandiari","MohammadHossein Bateni","Deepak Ramachandran","Bryan Perozzi","Vahab Mirrokni"],"pdf_url":"https://arxiv.org/pdf/2205.10403v2.pdf","comment":"Accepted at the Transactions of Machine Learning Research (TMLR)\n Journal"},{"id":"http://arxiv.org/abs/2307.08913v2","updated":"2023-07-19T14:18:00Z","published":"2023-07-18T01:16:23Z","title":"Towards the Sparseness of Projection Head in Self-Supervised Learning","summary":" In recent years, self-supervised learning (SSL) has emerged as a promising\napproach for extracting valuable representations from unlabeled data. One\nsuccessful SSL method is contrastive learning, which aims to bring positive\nexamples closer while pushing negative examples apart. 
Many current contrastive\nlearning approaches utilize a parameterized projection head. Through a\ncombination of empirical analysis and theoretical investigation, we provide\ninsights into the internal mechanisms of the projection head and its\nrelationship with the phenomenon of dimensional collapse. Our findings\ndemonstrate that the projection head enhances the quality of representations by\nperforming contrastive loss in a projected subspace. Therefore, we propose an\nassumption that only a subset of features is necessary when minimizing the\ncontrastive loss of a mini-batch of data. Theoretical analysis further suggests\nthat a sparse projection head can enhance generalization, leading us to\nintroduce SparseHead - a regularization term that effectively constrains the\nsparsity of the projection head, and can be seamlessly integrated with any\nself-supervised learning (SSL) approaches. Our experimental results validate\nthe effectiveness of SparseHead, demonstrating its ability to improve the\nperformance of existing contrastive methods.\n","authors":["Zeen Song","Xingzhe Su","Jingyao Wang","Wenwen Qiang","Changwen Zheng","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2307.08913v2.pdf","comment":"9 pages,3 figures"},{"id":"http://arxiv.org/abs/2305.15851v2","updated":"2023-07-19T14:16:22Z","published":"2023-05-25T08:43:11Z","title":"On sampling determinantal and Pfaffian point processes on a quantum\n computer","summary":" DPPs were introduced by Macchi as a model in quantum optics the 1970s. Since\nthen, they have been widely used as models and subsampling tools in statistics\nand computer science. Most applications require sampling from a DPP, and given\ntheir quantum origin, it is natural to wonder whether sampling a DPP on a\nquantum computer is easier than on a classical one. We focus here on DPPs over\na finite state space, which are distributions over the subsets of\n$\\{1,\\dots,N\\}$ parametrized by an $N\\times N$ Hermitian kernel matrix. Vanilla\nsampling consists in two steps, of respective costs $\\mathcal{O}(N^3)$ and\n$\\mathcal{O}(Nr^2)$ operations on a classical computer, where $r$ is the rank\nof the kernel matrix. A large first part of the current paper consists in\nexplaining why the state-of-the-art in quantum simulation of fermionic systems\nalready yields quantum DPP sampling algorithms. We then modify existing quantum\ncircuits, and discuss their insertion in a full DPP sampling pipeline that\nstarts from practical kernel specifications. The bottom line is that, with $P$\n(classical) parallel processors, we can divide the preprocessing cost by $P$\nand build a quantum circuit with $\\mathcal{O}(Nr)$ gates that sample a given\nDPP, with depth varying from $\\mathcal{O}(N)$ to $\\mathcal{O}(r\\log N)$\ndepending on qubit-communication constraints on the target machine. We also\nconnect existing work on the simulation of superconductors to Pfaffian point\nprocesses, which generalize DPPs and would be a natural addition to the machine\nlearner's toolbox. Finally, the circuits are empirically validated on a\nclassical simulator and on 5-qubit machines.\n","authors":["Rémi Bardenet","Michaël Fanuel","Alexandre Feller"],"pdf_url":"https://arxiv.org/pdf/2305.15851v2.pdf","comment":"48 pages, 8 figures. 
Additional results about parity of cardinality\n of PfPP samples"},{"id":"http://arxiv.org/abs/2307.09994v1","updated":"2023-07-19T13:58:01Z","published":"2023-07-19T13:58:01Z","title":"Impact of Disentanglement on Pruning Neural Networks","summary":" Deploying deep learning neural networks on edge devices, to accomplish task\nspecific objectives in the real-world, requires a reduction in their memory\nfootprint, power consumption, and latency. This can be realized via efficient\nmodel compression. Disentangled latent representations produced by variational\nautoencoder (VAE) networks are a promising approach for achieving model\ncompression because they mainly retain task-specific information, discarding\nuseless information for the task at hand. We make use of the Beta-VAE framework\ncombined with a standard criterion for pruning to investigate the impact of\nforcing the network to learn disentangled representations on the pruning\nprocess for the task of classification. In particular, we perform experiments\non MNIST and CIFAR10 datasets, examine disentanglement challenges, and propose\na path forward for future works.\n","authors":["Carl Shneider","Peyman Rostami","Anis Kacem","Nilotpal Sinha","Abd El Rahman Shabayek","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2307.09994v1.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2307.08347v2","updated":"2023-07-19T13:55:32Z","published":"2023-07-17T09:38:41Z","title":"M-FLAG: Medical Vision-Language Pre-training with Frozen Language Models\n and Latent Space Geometry Optimization","summary":" Medical vision-language models enable co-learning and integrating features\nfrom medical imaging and clinical text. However, these models are not easy to\ntrain and the latent representation space can be complex. Here we propose a\nnovel way for pre-training and regularising medical vision-language models. The\nproposed method, named Medical vision-language pre-training with Frozen\nlanguage models and Latent spAce Geometry optimization (M-FLAG), leverages a\nfrozen language model for training stability and efficiency and introduces a\nnovel orthogonality loss to harmonize the latent space geometry. We demonstrate\nthe potential of the pre-trained model on three downstream tasks: medical image\nclassification, segmentation, and object detection. Extensive experiments\nacross five public datasets demonstrate that M-FLAG significantly outperforms\nexisting medical vision-language pre-training approaches and reduces the number\nof parameters by 78\\%. Notably, M-FLAG achieves outstanding performance on the\nsegmentation task while using only 1\\% of the RSNA dataset, even outperforming\nImageNet pre-trained models that have been fine-tuned using 100\\% of the data.\n","authors":["Che Liu","Sibo Cheng","Chen Chen","Mengyun Qiao","Weitong Zhang","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2307.08347v2.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.09989v1","updated":"2023-07-19T13:49:35Z","published":"2023-07-19T13:49:35Z","title":"UniMatch: A Unified User-Item Matching Framework for the Multi-purpose\n Merchant Marketing","summary":" When doing private domain marketing with cloud services, the merchants\nusually have to purchase different machine learning models for the multiple\nmarketing purposes, leading to a very high cost. We present a unified user-item\nmatching framework to simultaneously conduct item recommendation and user\ntargeting with just one model. 
We empirically demonstrate that the above\nconcurrent modeling is viable via modeling the user-item interaction matrix\nwith the multinomial distribution, and propose a bidirectional bias-corrected\nNCE loss for the implementation. The proposed loss function guides the model to\nlearn the user-item joint probability $p(u,i)$ instead of the conditional\nprobability $p(i|u)$ or $p(u|i)$ through correcting both the users and items'\nbiases caused by the in-batch negative sampling. In addition, our framework is\nmodel-agnostic enabling a flexible adaptation of different model architectures.\nExtensive experiments demonstrate that our framework results in significant\nperformance gains in comparison with the state-of-the-art methods, with greatly\nreduced cost on computing resources and daily maintenance.\n","authors":["Qifang Zhao","Tianyu Li","Meng Du","Yu Jiang","Qinghui Sun","Zhongyao Wang","Hong Liu","Huan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09988v1","updated":"2023-07-19T13:49:12Z","published":"2023-07-19T13:49:12Z","title":"TinyTrain: Deep Neural Network Training at the Extreme Edge","summary":" On-device training is essential for user personalisation and privacy. With\nthe pervasiveness of IoT devices and microcontroller units (MCU), this task\nbecomes more challenging due to the constrained memory and compute resources,\nand the limited availability of labelled user data. Nonetheless, prior works\nneglect the data scarcity issue, require excessively long training time (e.g. a\nfew hours), or induce substantial accuracy loss ($\\geq$10\\%). We propose\nTinyTrain, an on-device training approach that drastically reduces training\ntime by selectively updating parts of the model and explicitly coping with data\nscarcity. TinyTrain introduces a task-adaptive sparse-update method that\ndynamically selects the layer/channel based on a multi-objective criterion that\njointly captures user data, the memory, and the compute capabilities of the\ntarget device, leading to high accuracy on unseen tasks with reduced\ncomputation and memory footprint. TinyTrain outperforms vanilla fine-tuning of\nthe entire network by 3.6-5.0\\% in accuracy, while reducing the backward-pass\nmemory and computation cost by up to 2,286$\\times$ and 7.68$\\times$,\nrespectively. Targeting broadly used real-world edge devices, TinyTrain\nachieves 9.5$\\times$ faster and 3.5$\\times$ more energy-efficient training over\nstatus-quo approaches, and 2.8$\\times$ smaller memory footprint than SOTA\napproaches, while remaining within the 1 MB memory envelope of MCU-grade\nplatforms.\n","authors":["Young D. Kwon","Rui Li","Stylianos I. Venieris","Jagmohan Chauhan","Nicholas D. Lane","Cecilia Mascolo"],"pdf_url":"https://arxiv.org/pdf/2307.09988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14518v2","updated":"2023-07-19T13:48:46Z","published":"2023-02-28T12:13:57Z","title":"Generalization Error Bounds for Noisy, Iterative Algorithms via Maximal\n Leakage","summary":" We adopt an information-theoretic framework to analyze the generalization\nbehavior of the class of iterative, noisy learning algorithms. This class is\nparticularly suitable for study under information-theoretic metrics as the\nalgorithms are inherently randomized, and it includes commonly used algorithms\nsuch as Stochastic Gradient Langevin Dynamics (SGLD). 
Herein, we use the\nmaximal leakage (equivalently, the Sibson mutual information of order infinity)\nmetric, as it is simple to analyze, and it implies both bounds on the\nprobability of having a large generalization error and on its expected value.\nWe show that, if the update function (e.g., gradient) is bounded in $L_2$-norm\nand the additive noise is isotropic Gaussian noise, then one can obtain an\nupper-bound on maximal leakage in semi-closed form. Furthermore, we demonstrate\nhow the assumptions on the update function affect the optimal (in the sense of\nminimizing the induced maximal leakage) choice of the noise. Finally, we\ncompute explicit tight upper bounds on the induced maximal leakage for other\nscenarios of interest.\n","authors":["Ibrahim Issa","Amedeo Roberto Esposito","Michael Gastpar"],"pdf_url":"https://arxiv.org/pdf/2302.14518v2.pdf","comment":"Updated to fix an error in Theorem 4 (asymptotic analysis)"},{"id":"http://arxiv.org/abs/2210.14037v2","updated":"2023-07-19T13:43:07Z","published":"2022-10-25T14:13:53Z","title":"Revisiting Softmax for Uncertainty Approximation in Text Classification","summary":" Uncertainty approximation in text classification is an important area with\napplications in domain adaptation and interpretability. One of the most widely\nused uncertainty approximation methods is Monte Carlo (MC) Dropout, which is\ncomputationally expensive as it requires multiple forward passes through the\nmodel. A cheaper alternative is to simply use the softmax based on a single\nforward pass without dropout to estimate model uncertainty. However, prior work\nhas indicated that these predictions tend to be overconfident. In this paper,\nwe perform a thorough empirical analysis of these methods on five datasets with\ntwo base neural architectures in order to identify the trade-offs between the\ntwo. We compare both softmax and an efficient version of MC Dropout on their\nuncertainty approximations and downstream text classification performance,\nwhile weighing their runtime (cost) against performance (benefit). We find\nthat, while MC dropout produces the best uncertainty approximations, using a\nsimple softmax leads to competitive and in some cases better uncertainty\nestimation for text classification at a much lower computational cost,\nsuggesting that softmax can in fact be a sufficient uncertainty estimate when\ncomputational resources are a concern.\n","authors":["Andreas Nugaard Holm","Dustin Wright","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2210.14037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09977v1","updated":"2023-07-19T13:33:43Z","published":"2023-07-19T13:33:43Z","title":"Learner Referral for Cost-Effective Federated Learning Over Hierarchical\n IoT Networks","summary":" The paradigm of federated learning (FL) to address data privacy concerns by\nlocally training parameters on resource-constrained clients in a distributed\nmanner has garnered significant attention. Nonetheless, FL is not applicable\nwhen not all clients within the coverage of the FL server are registered with\nthe FL network. To bridge this gap, this paper proposes joint learner referral\naided federated client selection (LRef-FedCS), along with communications and\ncomputing resource scheduling, and local model accuracy optimization (LMAO)\nmethods. These methods are designed to minimize the cost incurred by the\nworst-case participant and ensure the long-term fairness of FL in hierarchical\nInternet of Things (HieIoT) networks. 
Utilizing the Lyapunov optimization\ntechnique, we reformulate the original problem into a stepwise joint\noptimization problem (JOP). Subsequently, to tackle the mixed-integer\nnon-convex JOP, we separately and iteratively address LRef-FedCS and LMAO\nthrough the centralized method and self-adaptive global best harmony search\n(SGHS) algorithm, respectively. To enhance scalability, we further propose a\ndistributed LRef-FedCS approach based on a matching game to replace the\ncentralized method described above. Numerical simulations and experimental\nresults on the MNIST/CIFAR-10 datasets demonstrate that our proposed LRef-FedCS\napproach could achieve a good balance between pursuing high global accuracy and\nreducing cost.\n","authors":["Yulan Gao","Ziqiang Ye","Yue Xiao","Wei Xiang"],"pdf_url":"https://arxiv.org/pdf/2307.09977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03587v2","updated":"2023-07-19T13:23:29Z","published":"2023-07-07T13:29:07Z","title":"BOF-UCB: A Bayesian-Optimistic Frequentist Algorithm for Non-Stationary\n Contextual Bandits","summary":" We propose a novel Bayesian-Optimistic Frequentist Upper Confidence Bound\n(BOF-UCB) algorithm for stochastic contextual linear bandits in non-stationary\nenvironments. This unique combination of Bayesian and frequentist principles\nenhances adaptability and performance in dynamic settings. The BOF-UCB\nalgorithm utilizes sequential Bayesian updates to infer the posterior\ndistribution of the unknown regression parameter, and subsequently employs a\nfrequentist approach to compute the Upper Confidence Bound (UCB) by maximizing\nthe expected reward over the posterior distribution. We provide theoretical\nguarantees of BOF-UCB's performance and demonstrate its effectiveness in\nbalancing exploration and exploitation on synthetic datasets and classical\ncontrol tasks in a reinforcement learning setting. Our results show that\nBOF-UCB outperforms existing methods, making it a promising solution for\nsequential decision-making in non-stationary environments.\n","authors":["Nicklas Werge","Abdullah Akgül","Melih Kandemir"],"pdf_url":"https://arxiv.org/pdf/2307.03587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09946v2","updated":"2023-07-19T13:15:08Z","published":"2023-05-17T04:56:11Z","title":"AdaMSS: Adaptive Multi-Modality Segmentation-to-Survival Learning for\n Survival Outcome Prediction from PET/CT Images","summary":" Survival prediction is a major concern for cancer management. Deep survival\nmodels based on deep learning have been widely adopted to perform end-to-end\nsurvival prediction from medical images. Recent deep survival models achieved\npromising performance by jointly performing tumor segmentation with survival\nprediction, where the models were guided to extract tumor-related information\nthrough Multi-Task Learning (MTL). However, these deep survival models have\ndifficulties in exploring out-of-tumor prognostic information. In addition,\nexisting deep survival models are unable to effectively leverage multi-modality\nimages. Empirically-designed fusion strategies were commonly adopted to fuse\nmulti-modality information via task-specific manually-designed networks, thus\nlimiting the adaptability to different scenarios. In this study, we propose an\nAdaptive Multi-modality Segmentation-to-Survival model (AdaMSS) for survival\nprediction from PET/CT images. 
Instead of adopting MTL, we propose a novel\nSegmentation-to-Survival Learning (SSL) strategy, where our AdaMSS is trained\nfor tumor segmentation and survival prediction sequentially in two stages. This\nstrategy enables the AdaMSS to focus on tumor regions in the first stage and\ngradually expand its focus to include other prognosis-related regions in the\nsecond stage. We also propose a data-driven strategy to fuse multi-modality\ninformation, which realizes adaptive optimization of fusion strategies based on\ntraining data during training. With the SSL and data-driven fusion strategies,\nour AdaMSS is designed as an adaptive model that can self-adapt its focus\nregions and fusion strategy for different training stages. Extensive\nexperiments with two large clinical datasets show that our AdaMSS outperforms\nstate-of-the-art survival prediction methods.\n","authors":["Mingyuan Meng","Bingxin Gu","Michael Fulham","Shaoli Song","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2305.09946v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2307.09964v1","updated":"2023-07-19T13:14:47Z","published":"2023-07-19T13:14:47Z","title":"Towards green AI-based software systems: an architecture-centric\n approach (GAISSA)","summary":" Nowadays, AI-based systems have achieved outstanding results and have\noutperformed humans in different domains. However, the processes of training AI\nmodels and inferring from them require high computational resources, which pose\na significant challenge in the current energy efficiency societal demand. To\ncope with this challenge, this research project paper describes the main\nvision, goals, and expected outcomes of the GAISSA project. The GAISSA project\naims at providing data scientists and software engineers tool-supported,\narchitecture-centric methods for the modelling and development of green\nAI-based systems. Although the project is in an initial stage, we describe the\ncurrent research results, which illustrate the potential to achieve GAISSA\nobjectives.\n","authors":["Silverio Martínez-Fernández","Xavier Franch","Francisco Durán"],"pdf_url":"https://arxiv.org/pdf/2307.09964v1.pdf","comment":"Accepted for publication as full paper - 2023 49th Euromicro\n Conference Series on Software Engineering and Advanced Applications (SEAA)"},{"id":"http://arxiv.org/abs/2210.06226v2","updated":"2023-07-19T13:08:21Z","published":"2022-10-12T14:15:39Z","title":"Alpha-divergence Variational Inference Meets Importance Weighted\n Auto-Encoders: Methodology and Asymptotics","summary":" Several algorithms involving the Variational R\\'enyi (VR) bound have been\nproposed to minimize an alpha-divergence between a target posterior\ndistribution and a variational distribution. Despite promising empirical\nresults, those algorithms resort to biased stochastic gradient descent\nprocedures and thus lack theoretical guarantees. In this paper, we formalize\nand study the VR-IWAE bound, a generalization of the Importance Weighted\nAuto-Encoder (IWAE) bound. We show that the VR-IWAE bound enjoys several\ndesirable properties and notably leads to the same stochastic gradient descent\nprocedure as the VR bound in the reparameterized case, but this time by relying\non unbiased gradient estimators. We then provide two complementary theoretical\nanalyses of the VR-IWAE bound and thus of the standard IWAE bound. Those\nanalyses shed light on the benefits or lack thereof of these bounds. 
Lastly, we\nillustrate our theoretical claims over toy and real-data examples.\n","authors":["Kamélia Daudel","Joe Benton","Yuyang Shi","Arnaud Doucet"],"pdf_url":"https://arxiv.org/pdf/2210.06226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2001.05887v4","updated":"2023-07-19T12:58:18Z","published":"2020-01-16T15:24:26Z","title":"MixPath: A Unified Approach for One-shot Neural Architecture Search","summary":" Blending multiple convolutional kernels is proved advantageous in neural\narchitecture design. However, current two-stage neural architecture search\nmethods are mainly limited to single-path search spaces. How to efficiently\nsearch models of multi-path structures remains a difficult problem. In this\npaper, we are motivated to train a one-shot multi-path supernet to accurately\nevaluate the candidate architectures. Specifically, we discover that in the\nstudied search spaces, feature vectors summed from multiple paths are nearly\nmultiples of those from a single path. Such disparity perturbs the supernet\ntraining and its ranking ability. Therefore, we propose a novel mechanism\ncalled Shadow Batch Normalization (SBN) to regularize the disparate feature\nstatistics. Extensive experiments prove that SBNs are capable of stabilizing\nthe optimization and improving ranking performance. We call our unified\nmulti-path one-shot approach as MixPath, which generates a series of models\nthat achieve state-of-the-art results on ImageNet.\n","authors":["Xiangxiang Chu","Shun Lu","Xudong Li","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2001.05887v4.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09955v1","updated":"2023-07-19T12:51:28Z","published":"2023-07-19T12:51:28Z","title":"XSkill: Cross Embodiment Skill Discovery","summary":" Human demonstration videos are a widely available data source for robot\nlearning and an intuitive user interface for expressing desired behavior.\nHowever, directly extracting reusable robot manipulation skills from\nunstructured human videos is challenging due to the big embodiment difference\nand unobserved action parameters. To bridge this embodiment gap, this paper\nintroduces XSkill, an imitation learning framework that 1) discovers a\ncross-embodiment representation called skill prototypes purely from unlabeled\nhuman and robot manipulation videos, 2) transfers the skill representation to\nrobot actions using conditional diffusion policy, and finally, 3) composes the\nlearned skill to accomplish unseen tasks specified by a human prompt video. Our\nexperiments in simulation and real-world environments show that the discovered\nskill prototypes facilitate both skill transfer and composition for unseen\ntasks, resulting in a more general and scalable imitation learning framework.\nThe performance of XSkill is best understood from the anonymous website:\nhttps://xskillcorl.github.io.\n","authors":["Mengda Xu","Zhenjia Xu","Cheng Chi","Manuela Veloso","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2307.09955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09943v1","updated":"2023-07-19T12:35:16Z","published":"2023-07-19T12:35:16Z","title":"Impatient Bandits: Optimizing for the Long-Term Without Delay","summary":" Recommender systems are a ubiquitous feature of online platforms.\nIncreasingly, they are explicitly tasked with increasing users' long-term\nsatisfaction. In this context, we study a content exploration task, which we\nformalize as a multi-armed bandit problem with delayed rewards. 
We observe that\nthere is an apparent trade-off in choosing the learning signal: Waiting for the\nfull reward to become available might take several weeks, hurting the rate at\nwhich learning happens, whereas measuring short-term proxy rewards reflects the\nactual long-term goal only imperfectly. We address this challenge in two steps.\nFirst, we develop a predictive model of delayed rewards that incorporates all\ninformation obtained to date. Full observations as well as partial (short or\nmedium-term) outcomes are combined through a Bayesian filter to obtain a\nprobabilistic belief. Second, we devise a bandit algorithm that takes advantage\nof this new predictive model. The algorithm quickly learns to identify content\naligned with long-term success by carefully balancing exploration and\nexploitation. We apply our approach to a podcast recommendation problem, where\nwe seek to identify shows that users engage with repeatedly over two months. We\nempirically validate that our approach results in substantially better\nperformance compared to approaches that either optimize for short-term proxies,\nor wait for the long-term outcome to be fully realized.\n","authors":["Thomas McDonald","Lucas Maystre","Mounia Lalmas","Daniel Russo","Kamil Ciosek"],"pdf_url":"https://arxiv.org/pdf/2307.09943v1.pdf","comment":"Presented at the 29th ACM SIGKDD Conference on Knowledge Discovery\n and Data Mining (KDD '23)"},{"id":"http://arxiv.org/abs/2307.09942v1","updated":"2023-07-19T12:35:09Z","published":"2023-07-19T12:35:09Z","title":"TREEMENT: Interpretable Patient-Trial Matching via Personalized Dynamic\n Tree-Based Memory Network","summary":" Clinical trials are critical for drug development but often suffer from\nexpensive and inefficient patient recruitment. In recent years, machine\nlearning models have been proposed for speeding up patient recruitment via\nautomatically matching patients with clinical trials based on longitudinal\npatient electronic health records (EHR) data and eligibility criteria of\nclinical trials. However, they either depend on trial-specific expert rules\nthat cannot expand to other trials or perform matching at a very general level\nwith a black-box model where the lack of interpretability makes the model\nresults difficult to be adopted.\n To provide accurate and interpretable patient trial matching, we introduce a\npersonalized dynamic tree-based memory network model named TREEMENT. It\nutilizes hierarchical clinical ontologies to expand the personalized patient\nrepresentation learned from sequential EHR data, and then uses an attentional\nbeam-search query learned from eligibility criteria embedding to offer a\ngranular level of alignment for improved performance and interpretability. We\nevaluated TREEMENT against existing models on real-world datasets and\ndemonstrated that TREEMENT outperforms the best baseline by 7% in terms of\nerror reduction in criteria-level matching and achieves state-of-the-art\nresults in its trial-level matching ability. Furthermore, we also show TREEMENT\ncan offer good interpretability to make the model results easier for adoption.\n","authors":["Brandon Theodorou","Cao Xiao","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2307.09942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02486v2","updated":"2023-07-19T12:25:35Z","published":"2023-07-05T17:59:38Z","title":"LongNet: Scaling Transformers to 1,000,000,000 Tokens","summary":" Scaling sequence length has become a critical demand in the era of large\nlanguage models. 
However, existing methods struggle with either computational\ncomplexity or model expressivity, rendering the maximum sequence length\nrestricted. To address this issue, we introduce LongNet, a Transformer variant\nthat can scale sequence length to more than 1 billion tokens, without\nsacrificing the performance on shorter sequences. Specifically, we propose\ndilated attention, which expands the attentive field exponentially as the\ndistance grows. LongNet has significant advantages: 1) it has a linear\ncomputation complexity and a logarithm dependency between any two tokens in a\nsequence; 2) it can be served as a distributed trainer for extremely long\nsequences; 3) its dilated attention is a drop-in replacement for standard\nattention, which can be seamlessly integrated with the existing\nTransformer-based optimization. Experiments results demonstrate that LongNet\nyields strong performance on both long-sequence modeling and general language\ntasks. Our work opens up new possibilities for modeling very long sequences,\ne.g., treating a whole corpus or even the entire Internet as a sequence.\n","authors":["Jiayu Ding","Shuming Ma","Li Dong","Xingxing Zhang","Shaohan Huang","Wenhui Wang","Nanning Zheng","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.02486v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2302.07265v2","updated":"2023-07-19T12:18:34Z","published":"2023-02-14T18:59:02Z","title":"The Meta-Evaluation Problem in Explainable AI: Identifying Reliable\n Estimators with MetaQuantus","summary":" One of the unsolved challenges in the field of Explainable AI (XAI) is\ndetermining how to most reliably estimate the quality of an explanation method\nin the absence of ground truth explanation labels. Resolving this issue is of\nutmost importance as the evaluation outcomes generated by competing evaluation\nmethods (or ''quality estimators''), which aim at measuring the same property\nof an explanation method, frequently present conflicting rankings. Such\ndisagreements can be challenging for practitioners to interpret, thereby\ncomplicating their ability to select the best-performing explanation method. We\naddress this problem through a meta-evaluation of different quality estimators\nin XAI, which we define as ''the process of evaluating the evaluation method''.\nOur novel framework, MetaQuantus, analyses two complementary performance\ncharacteristics of a quality estimator: its resilience to noise and reactivity\nto randomness, thus circumventing the need for ground truth labels. We\ndemonstrate the effectiveness of our framework through a series of experiments,\ntargeting various open questions in XAI such as the selection and\nhyperparameter optimisation of quality estimators. Our work is released under\nan open-source license (https://github.com/annahedstroem/MetaQuantus) to serve\nas a development tool for XAI- and Machine Learning (ML) practitioners to\nverify and benchmark newly constructed quality estimators in a given\nexplainability context. With this work, we provide the community with clear and\ntheoretically-grounded guidance for identifying reliable evaluation methods,\nthus facilitating reproducibility in the field.\n","authors":["Anna Hedström","Philine Bommer","Kristoffer K. Wickstrøm","Wojciech Samek","Sebastian Lapuschkin","Marina M. -C. 
Höhne"],"pdf_url":"https://arxiv.org/pdf/2302.07265v2.pdf","comment":"35 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.09933v1","updated":"2023-07-19T12:15:06Z","published":"2023-07-19T12:15:06Z","title":"Spuriosity Didn't Kill the Classifier: Using Invariant Predictions to\n Harness Spurious Features","summary":" To avoid failures on out-of-distribution data, recent works have sought to\nextract features that have a stable or invariant relationship with the label\nacross domains, discarding the \"spurious\" or unstable features whose\nrelationship with the label changes across domains. However, unstable features\noften carry complementary information about the label that could boost\nperformance if used correctly in the test domain. Our main contribution is to\nshow that it is possible to learn how to use these unstable features in the\ntest domain without labels. In particular, we prove that pseudo-labels based on\nstable features provide sufficient guidance for doing so, provided that stable\nand unstable features are conditionally independent given the label. Based on\nthis theoretical insight, we propose Stable Feature Boosting (SFB), an\nalgorithm for: (i) learning a predictor that separates stable and\nconditionally-independent unstable features; and (ii) using the stable-feature\npredictions to adapt the unstable-feature predictions in the test domain.\nTheoretically, we prove that SFB can learn an asymptotically-optimal predictor\nwithout test-domain labels. Empirically, we demonstrate the effectiveness of\nSFB on real and synthetic data.\n","authors":["Cian Eastwood","Shashank Singh","Andrei Liviu Nicolicioiu","Marin Vlastelica","Julius von Kügelgen","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2307.09933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09931v1","updated":"2023-07-19T12:12:17Z","published":"2023-07-19T12:12:17Z","title":"DISA: DIfferentiable Similarity Approximation for Universal Multimodal\n Registration","summary":" Multimodal image registration is a challenging but essential step for\nnumerous image-guided procedures. Most registration algorithms rely on the\ncomputation of complex, frequently non-differentiable similarity metrics to\ndeal with the appearance discrepancy of anatomical structures between imaging\nmodalities. Recent Machine Learning based approaches are limited to specific\nanatomy-modality combinations and do not generalize to new settings. We propose\na generic framework for creating expressive cross-modal descriptors that enable\nfast deformable global registration. We achieve this by approximating existing\nmetrics with a dot-product in the feature space of a small convolutional neural\nnetwork (CNN) which is inherently differentiable can be trained without\nregistered data. Our method is several orders of magnitude faster than local\npatch-based metrics and can be directly applied in clinical settings by\nreplacing the similarity measure with the proposed one. Experiments on three\ndifferent datasets demonstrate that our approach generalizes well beyond the\ntraining data, yielding a broad capture range even on unseen anatomies and\nmodality pairs, without the need for specialized retraining. We make our\ntraining code and data publicly available.\n","authors":["Matteo Ronchetti","Wolfgang Wein","Nassir Navab","Oliver Zettinig","Raphael Prevost"],"pdf_url":"https://arxiv.org/pdf/2307.09931v1.pdf","comment":"This preprint was submitted to MICCAI 2023. 
The Version of Record of\n this contribution will be published in Springer LNCS"},{"id":"http://arxiv.org/abs/2307.04639v2","updated":"2023-07-19T12:08:51Z","published":"2023-07-10T15:35:31Z","title":"Multimodal brain age estimation using interpretable adaptive\n population-graph learning","summary":" Brain age estimation is clinically important as it can provide valuable\ninformation in the context of neurodegenerative diseases such as Alzheimer's.\nPopulation graphs, which include multimodal imaging information of the subjects\nalong with the relationships among the population, have been used in literature\nalong with Graph Convolutional Networks (GCNs) and have proved beneficial for a\nvariety of medical imaging tasks. A population graph is usually static and\nconstructed manually using non-imaging information. However, graph construction\nis not a trivial task and might significantly affect the performance of the\nGCN, which is inherently very sensitive to the graph structure. In this work,\nwe propose a framework that learns a population graph structure optimized for\nthe downstream task. An attention mechanism assigns weights to a set of imaging\nand non-imaging features (phenotypes), which are then used for edge extraction.\nThe resulting graph is used to train the GCN. The entire pipeline can be\ntrained end-to-end. Additionally, by visualizing the attention weights that\nwere the most important for the graph construction, we increase the\ninterpretability of the graph. We use the UK Biobank, which provides a large\nvariety of neuroimaging and non-imaging phenotypes, to evaluate our method on\nbrain age regression and classification. The proposed method outperforms\ncompeting static graph approaches and other state-of-the-art adaptive methods.\nWe further show that the assigned attention scores indicate that there are both\nimaging and non-imaging phenotypes that are informative for brain age\nestimation and are in agreement with the relevant literature.\n","authors":["Kyriaki-Margarita Bintsi","Vasileios Baltatzis","Rolandos Alexandros Potamias","Alexander Hammers","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.04639v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.09916v1","updated":"2023-07-19T11:40:15Z","published":"2023-07-19T11:40:15Z","title":"TimeTuner: Diagnosing Time Representations for Time-Series Forecasting\n with Counterfactual Explanations","summary":" Deep learning (DL) approaches are being increasingly used for time-series\nforecasting, with many efforts devoted to designing complex DL models. Recent\nstudies have shown that the DL success is often attributed to effective data\nrepresentations, fostering the fields of feature engineering and representation\nlearning. However, automated approaches for feature learning are typically\nlimited with respect to incorporating prior knowledge, identifying interactions\namong variables, and choosing evaluation metrics to ensure that the models are\nreliable. To improve on these limitations, this paper contributes a novel\nvisual analytics framework, namely TimeTuner, designed to help analysts\nunderstand how model behaviors are associated with localized correlations,\nstationarity, and granularity of time-series representations. The system mainly\nconsists of the following two-stage technique: We first leverage counterfactual\nexplanations to connect the relationships among time-series representations,\nmultivariate features and model predictions. 
Next, we design multiple\ncoordinated views including a partition-based correlation matrix and juxtaposed\nbivariate stripes, and provide a set of interactions that allow users to step\ninto the transformation selection process, navigate through the feature space,\nand reason the model performance. We instantiate TimeTuner with two\ntransformation methods of smoothing and sampling, and demonstrate its\napplicability on real-world time-series forecasting of univariate sunspots and\nmultivariate air pollutants. Feedback from domain experts indicates that our\nsystem can help characterize time-series representations and guide the feature\nengineering processes.\n","authors":["Jianing Hao","Qing Shi","Yilin Ye","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2307.09916v1.pdf","comment":"11 pages, 9 figures, this paper has been accepted by VIS2024"},{"id":"http://arxiv.org/abs/2307.09912v1","updated":"2023-07-19T11:32:24Z","published":"2023-07-19T11:32:24Z","title":"Deep projection networks for learning time-homogeneous dynamical systems","summary":" We consider the general class of time-homogeneous dynamical systems, both\ndiscrete and continuous, and study the problem of learning a meaningful\nrepresentation of the state from observed data. This is instrumental for the\ntask of learning a forward transfer operator of the system, that in turn can be\nused for forecasting future states or observables. The representation,\ntypically parametrized via a neural network, is associated with a projection\noperator and is learned by optimizing an objective function akin to that of\ncanonical correlation analysis (CCA). However, unlike CCA, our objective avoids\nmatrix inversions and therefore is generally more stable and applicable to\nchallenging scenarios. Our objective is a tight relaxation of CCA and we\nfurther enhance it by proposing two regularization schemes, one encouraging the\northogonality of the components of the representation while the other\nexploiting Chapman-Kolmogorov's equation. We apply our method to challenging\ndiscrete dynamical systems, discussing improvements over previous methods, as\nwell as to continuous dynamical systems.\n","authors":["Vladimir R. Kostic","Pietro Novelli","Riccardo Grazzi","Karim Lounici","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2307.09912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06698v2","updated":"2023-07-19T11:23:07Z","published":"2023-07-13T11:54:32Z","title":"IntelliGraphs: Datasets for Benchmarking Knowledge Graph Generation","summary":" Knowledge Graph Embedding (KGE) models are used to learn continuous\nrepresentations of entities and relations. A key task in the literature is\npredicting missing links between entities. However, Knowledge Graphs are not\njust sets of links but also have semantics underlying their structure.\nSemantics is crucial in several downstream tasks, such as query answering or\nreasoning. We introduce the subgraph inference task, where a model has to\ngenerate likely and semantically valid subgraphs. We propose IntelliGraphs, a\nset of five new Knowledge Graph datasets. The IntelliGraphs datasets contain\nsubgraphs with semantics expressed in logical rules for evaluating subgraph\ninference. We also present the dataset generator that produced the synthetic\ndatasets. We designed four novel baseline models, which include three models\nbased on traditional KGEs. We evaluate their expressiveness and show that these\nmodels cannot capture the semantics. 
We believe this benchmark will encourage\nthe development of machine learning models that emphasize semantic\nunderstanding.\n","authors":["Thiviyan Thanapalasingam","Emile van Krieken","Peter Bloem","Paul Groth"],"pdf_url":"https://arxiv.org/pdf/2307.06698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09211v3","updated":"2023-07-19T10:52:30Z","published":"2023-05-16T06:40:04Z","title":"CB-HVTNet: A channel-boosted hybrid vision transformer network for\n lymphocyte assessment in histopathological images","summary":" Transformers, due to their ability to learn long range dependencies, have\novercome the shortcomings of convolutional neural networks (CNNs) for global\nperspective learning. Therefore, they have gained the focus of researchers for\nseveral vision related tasks including medical diagnosis. However, their\nmulti-head attention module only captures global level feature representations,\nwhich is insufficient for medical images. To address this issue, we propose a\nChannel Boosted Hybrid Vision Transformer (CB HVT) that uses transfer learning\nto generate boosted channels and employs both transformers and CNNs to analyse\nlymphocytes in histopathological images. The proposed CB HVT comprises five\nmodules, including a channel generation module, channel exploitation module,\nchannel merging module, region-aware module, and a detection and segmentation\nhead, which work together to effectively identify lymphocytes. The channel\ngeneration module uses the idea of channel boosting through transfer learning\nto extract diverse channels from different auxiliary learners. In the CB HVT,\nthese boosted channels are first concatenated and ranked using an attention\nmechanism in the channel exploitation module. A fusion block is then utilized\nin the channel merging module for a gradual and systematic merging of the\ndiverse boosted channels to improve the network's learning representations. The\nCB HVT also employs a proposal network in its region aware module and a head to\neffectively identify objects, even in overlapping regions and with artifacts.\nWe evaluated the proposed CB HVT on two publicly available datasets for\nlymphocyte assessment in histopathological images. The results show that CB HVT\noutperformed other state of the art detection models, and has good\ngeneralization ability, demonstrating its value as a tool for pathologists.\n","authors":["Momina Liaqat Ali","Zunaira Rauf","Asifullah Khan","Anabia Sohail","Rafi Ullah","Jeonghwan Gwak"],"pdf_url":"https://arxiv.org/pdf/2305.09211v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09896v1","updated":"2023-07-19T10:50:36Z","published":"2023-07-19T10:50:36Z","title":"Repeated Observations for Classification","summary":" We study the problem of nonparametric classification with repeated observations.\nLet $\\bX$ be the $d$ dimensional feature vector and let $Y$ denote the label\ntaking values in $\\{1,\\dots ,M\\}$. In contrast to the usual setup with large sample\nsize $n$ and relatively low dimension $d$, this paper deals with the situation\nwhen, instead of observing a single feature vector $\\bX$, we are given $t$\nrepeated feature vectors $\\bV_1,\\dots ,\\bV_t $. Some simple classification\nrules are presented such that the conditional error probabilities have an\nexponential rate of convergence as $t\\to\\infty$. 
In the analysis,\nwe investigate particular models like robust detection by nominal densities,\nprototype classification, linear transformation, linear classification,\nscaling.\n","authors":["Hüseyin Afşer","László Györfi","Harro Walk"],"pdf_url":"https://arxiv.org/pdf/2307.09896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09883v1","updated":"2023-07-19T10:27:34Z","published":"2023-07-19T10:27:34Z","title":"Symmetric Equilibrium Learning of VAEs","summary":" We view variational autoencoders (VAE) as decoder-encoder pairs, which map\ndistributions in the data space to distributions in the latent space and vice\nversa. The standard learning approach for VAEs, i.e. maximisation of the\nevidence lower bound (ELBO), has an obvious asymmetry in that respect.\nMoreover, it requires a closed form a-priori latent distribution. This limits\nthe applicability of VAEs in more complex scenarios, such as general\nsemi-supervised learning and employing complex generative models as priors. We\npropose a Nash equilibrium learning approach that relaxes these restrictions\nand allows learning VAEs in situations where both the data and the latent\ndistributions are accessible only by sampling. The flexibility and simplicity\nof this approach allows its application to a wide range of learning scenarios\nand downstream tasks. We show experimentally that the models learned by this\nmethod are comparable to those obtained by ELBO learning and demonstrate its\napplicability for tasks that are not accessible by standard VAE learning.\n","authors":["Boris Flach","Dmitrij Schlesinger","Alexander Shekhovtsov"],"pdf_url":"https://arxiv.org/pdf/2307.09883v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.09882v1","updated":"2023-07-19T10:26:29Z","published":"2023-07-19T10:26:29Z","title":"Adversarial Likelihood Estimation with One-way Flows","summary":" Generative Adversarial Networks (GANs) can produce high-quality samples, but\ndo not provide an estimate of the probability density around the samples.\nHowever, it has been noted that maximizing the log-likelihood within an\nenergy-based setting can lead to an adversarial framework where the\ndiscriminator provides unnormalized density (often called energy). We further\ndevelop this perspective, incorporate importance sampling, and show that 1)\nWasserstein GAN performs a biased estimate of the partition function, and we\npropose instead to use an unbiased estimator; 2) when optimizing for\nlikelihood, one must maximize generator entropy. This is hypothesized to\nprovide a better mode coverage. Different from previous works, we explicitly\ncompute the density of the generated samples. This is the key enabler to\ndesigning an unbiased estimator of the partition function and computation of\nthe generator entropy term. The generator density is obtained via a new type of\nflow network, called one-way flow network, that is less constrained in terms of\narchitecture, as it does not require to have a tractable inverse function. Our\nexperimental results show that we converge faster, produce comparable sample\nquality to GANs with similar architecture, successfully avoid over-fitting to\ncommonly used datasets and produce smooth low-dimensional latent\nrepresentations of the training data.\n","authors":["Omri Ben-Dov","Pravir Singh Gupta","Victoria Abrevaya","Michael J. 
Black","Partha Ghosh"],"pdf_url":"https://arxiv.org/pdf/2307.09882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09866v1","updated":"2023-07-19T09:53:56Z","published":"2023-07-19T09:53:56Z","title":"Detecting Vulnerable Nodes in Urban Infrastructure Interdependent\n Network","summary":" Understanding and characterizing the vulnerability of urban infrastructures,\nwhich refers to the engineering facilities essential for the regular running of\ncities and that exist naturally in the form of networks, is of great value to\nus. Potential applications include protecting fragile facilities and designing\nrobust topologies, etc. Due to the strong correlation between different\ntopological characteristics and infrastructure vulnerability and their\ncomplicated evolution mechanisms, some heuristic and machine-assisted analysis\nfall short in addressing such a scenario. In this paper, we model the\ninterdependent network as a heterogeneous graph and propose a system based on\ngraph neural network with reinforcement learning, which can be trained on\nreal-world data, to characterize the vulnerability of the city system\naccurately. The presented system leverages deep learning techniques to\nunderstand and analyze the heterogeneous graph, which enables us to capture the\nrisk of cascade failure and discover vulnerable infrastructures of cities.\nExtensive experiments with various requests demonstrate not only the expressive\npower of our system but also transferring ability and necessity of the specific\ncomponents.\n","authors":["Jinzhu Mao","Liu Cao","Chen Gao","Huandong Wang","Hangyu Fan","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2307.09866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09862v1","updated":"2023-07-19T09:45:41Z","published":"2023-07-19T09:45:41Z","title":"Towards a population-informed approach to the definition of data-driven\n models for structural dynamics","summary":" Machine learning has affected the way in which many phenomena for various\ndomains are modelled, one of these domains being that of structural dynamics.\nHowever, because machine-learning algorithms are problem-specific, they often\nfail to perform efficiently in cases of data scarcity. To deal with such\nissues, combination of physics-based approaches and machine learning algorithms\nhave been developed. Although such methods are effective, they also require the\nanalyser's understanding of the underlying physics of the problem. The current\nwork is aimed at motivating the use of models which learn such relationships\nfrom a population of phenomena, whose underlying physics are similar. The\ndevelopment of such models is motivated by the way that physics-based models,\nand more specifically finite element models, work. Such models are considered\ntransferrable, explainable and trustworthy, attributes which are not trivially\nimposed or achieved for machine-learning models. For this reason,\nmachine-learning approaches are less trusted by industry and often considered\nmore difficult to form validated models. To achieve such data-driven models, a\npopulation-based scheme is followed here and two different machine-learning\nalgorithms from the meta-learning domain are used. The two algorithms are the\nmodel-agnostic meta-learning (MAML) algorithm and the conditional neural\nprocesses (CNP) model. The algorithms seem to perform as intended and\noutperform a traditional machine-learning algorithm at approximating the\nquantities of interest. 
Moreover, they exhibit behaviour similar to traditional\nmachine learning algorithms (e.g. neural networks or Gaussian processes),\nconcerning their performance as a function of the available structures in the\ntraining population.\n","authors":["G. Tsialiamanis","N. Dervilis","D. J. Wagg","K. Worden"],"pdf_url":"https://arxiv.org/pdf/2307.09862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v2","updated":"2023-07-19T09:23:43Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v2.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 12 figures, 13 tables"},{"id":"http://arxiv.org/abs/2307.09458v2","updated":"2023-07-19T09:22:02Z","published":"2023-07-18T17:39:04Z","title":"Does Circuit Analysis Interpretability Scale? Evidence from Multiple\n Choice Capabilities in Chinchilla","summary":" \\emph{Circuit analysis} is a promising technique for understanding the\ninternal mechanisms of language models. However, existing analyses are done in\nsmall models far from the state of the art. To address this, we present a case\nstudy of circuit analysis in the 70B Chinchilla model, aiming to test the\nscalability of circuit analysis. 
In particular, we study multiple-choice\nquestion answering, and investigate Chinchilla's capability to identify the\ncorrect answer \\emph{label} given knowledge of the correct answer \\emph{text}.\nWe find that the existing techniques of logit attribution, attention pattern\nvisualization, and activation patching naturally scale to Chinchilla, allowing\nus to identify and categorize a small set of `output nodes' (attention heads\nand MLPs).\n We further study the `correct letter' category of attention heads aiming to\nunderstand the semantics of their features, with mixed results. For normal\nmultiple-choice question answers, we significantly compress the query, key and\nvalue subspaces of the head without loss of performance when operating on the\nanswer labels for multiple-choice questions, and we show that the query and key\nsubspaces represent an `Nth item in an enumeration' feature to at least some\nextent. However, when we attempt to use this explanation to understand the\nheads' behaviour on a more general distribution including randomized answer\nlabels, we find that it is only a partial explanation, suggesting there is more\nto learn about the operation of `correct letter' heads on multiple choice\nquestion answering.\n","authors":["Tom Lieberum","Matthew Rahtz","János Kramár","Neel Nanda","Geoffrey Irving","Rohin Shah","Vladimir Mikulik"],"pdf_url":"https://arxiv.org/pdf/2307.09458v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10404v4","updated":"2023-07-19T09:17:09Z","published":"2023-06-17T18:16:51Z","title":"The RL Perceptron: Generalisation Dynamics of Policy Learning in High\n Dimensions","summary":" Reinforcement learning (RL) algorithms have proven transformative in a range\nof domains. To tackle real-world domains, these systems often use neural\nnetworks to learn policies directly from pixels or other high-dimensional\nsensory input. By contrast, much theory of RL has focused on discrete state\nspaces or worst-case analysis, and fundamental questions remain about the\ndynamics of policy learning in high-dimensional settings. Here, we propose a\nsolvable high-dimensional model of RL that can capture a variety of learning\nprotocols, and derive its typical dynamics as a set of closed-form ordinary\ndifferential equations (ODEs). We derive optimal schedules for the learning\nrates and task difficulty - analogous to annealing schemes and curricula during\ntraining in RL - and show that the model exhibits rich behaviour, including\ndelayed learning under sparse rewards; a variety of learning regimes depending\non reward baselines; and a speed-accuracy trade-off driven by reward\nstringency. Experiments on variants of the Procgen game \"Bossfight\" and Arcade\nLearning Environment game \"Pong\" also show such a speed-accuracy trade-off in\npractice. 
Together, these results take a step towards closing the gap between\ntheory and practice in high-dimensional RL.\n","authors":["Nishil Patel","Sebastian Lee","Stefano Sarao Mannelli","Sebastian Goldt","Andrew Saxe"],"pdf_url":"https://arxiv.org/pdf/2306.10404v4.pdf","comment":"10 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2305.07898v2","updated":"2023-07-19T09:15:20Z","published":"2023-05-13T11:42:40Z","title":"Network-GIANT: Fully distributed Newton-type optimization via harmonic\n Hessian consensus","summary":" This paper considers the problem of distributed multi-agent learning, where\nthe global aim is to minimize a sum of local objective (empirical loss)\nfunctions through local optimization and information exchange between\nneighbouring nodes. We introduce a Newton-type fully distributed optimization\nalgorithm, Network-GIANT, which is based on GIANT, a Federated learning\nalgorithm that relies on a centralized parameter server. The Network-GIANT\nalgorithm is designed via a combination of gradient-tracking and a Newton-type\niterative algorithm at each node with consensus based averaging of local\ngradient and Newton updates. We prove that our algorithm guarantees semi-global\nand exponential convergence to the exact solution over the network assuming\nstrongly convex and smooth loss functions. We provide empirical evidence of the\nsuperior convergence performance of Network-GIANT over other state-of-the-art\ndistributed learning algorithms such as Network-DANE and Newton-Raphson\nConsensus.\n","authors":["Alessio Maritan","Ganesh Sharma","Luca Schenato","Subhrakanti Dey"],"pdf_url":"https://arxiv.org/pdf/2305.07898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09844v1","updated":"2023-07-19T09:03:41Z","published":"2023-07-19T09:03:41Z","title":"Reinforcement Learning for Credit Index Option Hedging","summary":" In this paper, we focus on finding the optimal hedging strategy of a credit\nindex option using reinforcement learning. We take a practical approach, where\nthe focus is on realism i.e. discrete time, transaction costs; even testing our\npolicy on real market data. We apply a state of the art algorithm, the Trust\nRegion Volatility Optimization (TRVO) algorithm and show that the derived\nhedging strategy outperforms the practitioner's Black & Scholes delta hedge.\n","authors":["Francesco Mandelli","Marco Pinciroli","Michele Trapletti","Edoardo Vittori"],"pdf_url":"https://arxiv.org/pdf/2307.09844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09340v2","updated":"2023-07-19T08:55:01Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. 
Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v2.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.09836v1","updated":"2023-07-19T08:47:41Z","published":"2023-07-19T08:47:41Z","title":"Near-Linear Time Projection onto the $\\ell_{1,\\infty}$ Ball; Application\n to Sparse Autoencoders","summary":" Looking for sparsity is nowadays crucial to speed up the training of\nlarge-scale neural networks. Projections onto the $\\ell_{1,2}$ and\n$\\ell_{1,\\infty}$ are among the most efficient techniques to sparsify and\nreduce the overall cost of neural networks. In this paper, we introduce a new\nprojection algorithm for the $\\ell_{1,\\infty}$ norm ball. The worst-case time\ncomplexity of this algorithm is $\\mathcal{O}\\big(nm+J\\log(nm)\\big)$ for a\nmatrix in $\\mathbb{R}^{n\\times m}$. $J$ is a term that tends to 0 when the\nsparsity is high, and to $nm$ when the sparsity is low. Its implementation is\neasy and it is guaranteed to converge to the exact solution in a finite time.\nMoreover, we propose to incorporate the $\\ell_{1,\\infty}$ ball projection while\ntraining an autoencoder to enforce feature selection and sparsity of the\nweights. Sparsification appears in the encoder to primarily do feature\nselection due to our application in biology, where only a very small part\n($<2\\%$) of the data is relevant. We show that both in the biological case and\nin the general case of sparsity that our method is the fastest.\n","authors":["Guillaume Perez","Laurent Condat","Michel Barlaud"],"pdf_url":"https://arxiv.org/pdf/2307.09836v1.pdf","comment":"22 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.09835v1","updated":"2023-07-19T08:46:47Z","published":"2023-07-19T08:46:47Z","title":"Deep Operator Network Approximation Rates for Lipschitz Operators","summary":" We establish universality and expression rate bounds for a class of neural\nDeep Operator Networks (DON) emulating Lipschitz (or H\\\"older) continuous maps\n$\\mathcal G:\\mathcal X\\to\\mathcal Y$ between (subsets of) separable Hilbert\nspaces $\\mathcal X$, $\\mathcal Y$. 
The DON architecture considered uses linear\nencoders $\\mathcal E$ and decoders $\\mathcal D$ via (biorthogonal) Riesz bases\nof $\\mathcal X$, $\\mathcal Y$, and an approximator network of an\ninfinite-dimensional, parametric coordinate map that is Lipschitz continuous on\nthe sequence space $\\ell^2(\\mathbb N)$. Unlike previous works ([Herrmann,\nSchwab and Zech: Neural and Spectral operator surrogates: construction and\nexpression rate bounds, SAM Report, 2022], [Marcati and Schwab: Exponential\nConvergence of Deep Operator Networks for Elliptic Partial Differential\nEquations, SAM Report, 2022]), which required for example $\\mathcal G$ to be\nholomorphic, the present expression rate results require mere Lipschitz (or\nH\\\"older) continuity of $\\mathcal G$. Key in the proof of the present\nexpression rate bounds is the use of either super-expressive activations (e.g.\n[Yarotski: Elementary superexpressive activations, Int. Conf. on ML, 2021],\n[Shen, Yang and Zhang: Neural network approximation: Three hidden layers are\nenough, Neural Networks, 2021], and the references there) which are inspired by\nthe Kolmogorov superposition theorem, or of nonstandard NN architectures with\nstandard (ReLU) activations as recently proposed in [Zhang, Shen and Yang:\nNeural Network Architecture Beyond Width and Depth, Adv. in Neural Inf. Proc.\nSys., 2022]. We illustrate the abstract results by approximation rate bounds\nfor emulation of a) solution operators for parametric elliptic variational\ninequalities, and b) Lipschitz maps of Hilbert-Schmidt operators.\n","authors":["Christoph Schwab","Andreas Stein","Jakob Zech"],"pdf_url":"https://arxiv.org/pdf/2307.09835v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2307.09829v1","updated":"2023-07-19T08:34:25Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little for\nclassification. This study empirically investigates the latter and expands the\nunderstanding of frequency shortcuts. First, we perform experiments on\nsynthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low- or high-frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. 
We recommend that future research should focus\non effective training schemes mitigating frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v1.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2307.09823v1","updated":"2023-07-19T08:21:01Z","published":"2023-07-19T08:21:01Z","title":"Multi-modal Learning based Prediction for Disease","summary":" Non alcoholic fatty liver disease (NAFLD) is the most common cause of chronic\nliver disease, which can be predicted accurately to prevent advanced fibrosis\nand cirrhosis. While, a liver biopsy, the gold standard for NAFLD diagnosis, is\ninvasive, expensive, and prone to sampling errors. Therefore, non-invasive\nstudies are extremely promising, yet they are still in their infancy due to the\nlack of comprehensive research data and intelligent methods for multi-modal\ndata. This paper proposes a NAFLD diagnosis system (DeepFLDDiag) combining a\ncomprehensive clinical dataset (FLDData) and a multi-modal learning based NAFLD\nprediction method (DeepFLD). The dataset includes over 6000 participants\nphysical examinations, laboratory and imaging studies, extensive\nquestionnaires, and facial images of partial participants, which is\ncomprehensive and valuable for clinical studies. From the dataset, we\nquantitatively analyze and select clinical metadata that most contribute to\nNAFLD prediction. Furthermore, the proposed DeepFLD, a deep neural network\nmodel designed to predict NAFLD using multi-modal input, including metadata and\nfacial images, outperforms the approach that only uses metadata. Satisfactory\nperformance is also verified on other unseen datasets. Inspiringly, DeepFLD can\nachieve competitive results using only facial images as input rather than\nmetadata, paving the way for a more robust and simpler non-invasive NAFLD\ndiagnosis.\n","authors":["Yaran Chen","Xueyu Chen","Yu Han","Haoran Li","Dongbin Zhao","Jingzhong Li","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09818v1","updated":"2023-07-19T08:06:37Z","published":"2023-07-19T08:06:37Z","title":"Deep unrolling Shrinkage Network for Dynamic MR imaging","summary":" Deep unrolling networks that utilize sparsity priors have achieved great\nsuccess in dynamic magnetic resonance (MR) imaging. The convolutional neural\nnetwork (CNN) is usually utilized to extract the transformed domain, and then\nthe soft thresholding (ST) operator is applied to the CNN-transformed data to\nenforce the sparsity priors. However, the ST operator is usually constrained to\nbe the same across all channels of the CNN-transformed data. In this paper, we\npropose a novel operator, called soft thresholding with channel attention\n(AST), that learns the threshold for each channel. In particular, we put\nforward a novel deep unrolling shrinkage network (DUS-Net) by unrolling the\nalternating direction method of multipliers (ADMM) for optimizing the\ntransformed $l_1$ norm dynamic MR reconstruction model. Experimental results on\nan open-access dynamic cine MR dataset demonstrate that the proposed DUS-Net\noutperforms the state-of-the-art methods. 
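The DUS-Net abstract above describes soft thresholding with channel attention (AST), which learns one threshold per channel of the CNN-transformed data. Below is a minimal PyTorch sketch of a per-channel learnable soft-thresholding layer; it omits the attention branch and is not the authors' AST implementation.

```python
import torch
import torch.nn as nn

class ChannelwiseSoftThreshold(nn.Module):
    """Soft thresholding with one learnable, non-negative threshold per channel.
    A rough sketch in the spirit of per-channel thresholds, not the paper's AST."""
    def __init__(self, num_channels: int):
        super().__init__()
        # parametrise thresholds through softplus so they stay positive
        self.raw_tau = nn.Parameter(torch.zeros(num_channels))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, ...); broadcast the threshold over spatial dims
        tau = nn.functional.softplus(self.raw_tau)
        tau = tau.view(1, -1, *([1] * (x.dim() - 2)))
        return torch.sign(x) * torch.clamp(x.abs() - tau, min=0.0)

# usage on CNN-transformed coefficients of shape (batch, channels, H, W)
layer = ChannelwiseSoftThreshold(num_channels=8)
coeffs = torch.randn(2, 8, 16, 16)
print(layer(coeffs).shape)
```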
The source code is available at\n\\url{https://github.com/yhao-z/DUS-Net}.\n","authors":["Yinghao Zhang","Xiaodi Li","Weihang Li","Yue Hu"],"pdf_url":"https://arxiv.org/pdf/2307.09818v1.pdf","comment":"5 pages,3 figures,2 tables"},{"id":"http://arxiv.org/abs/2307.09816v1","updated":"2023-07-19T08:05:46Z","published":"2023-07-19T08:05:46Z","title":"Manifold Learning with Sparse Regularised Optimal Transport","summary":" Manifold learning is a central task in modern statistics and data science.\nMany datasets (cells, documents, images, molecules) can be represented as point\nclouds embedded in a high dimensional ambient space, however the degrees of\nfreedom intrinsic to the data are usually far fewer than the number of ambient\ndimensions. The task of detecting a latent manifold along which the data are\nembedded is a prerequisite for a wide family of downstream analyses. Real-world\ndatasets are subject to noisy observations and sampling, so that distilling\ninformation about the underlying manifold is a major challenge. We propose a\nmethod for manifold learning that utilises a symmetric version of optimal\ntransport with a quadratic regularisation that constructs a sparse and adaptive\naffinity matrix, that can be interpreted as a generalisation of the\nbistochastic kernel normalisation. We prove that the resulting kernel is\nconsistent with a Laplace-type operator in the continuous limit, establish\nrobustness to heteroskedastic noise and exhibit these results in simulations.\nWe identify a highly efficient computational scheme for computing this optimal\ntransport for discrete data and demonstrate that it outperforms competing\nmethods in a set of examples.\n","authors":["Stephen Zhang","Gilles Mordant","Tetsuya Matsumoto","Geoffrey Schiebinger"],"pdf_url":"https://arxiv.org/pdf/2307.09816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09810v1","updated":"2023-07-19T07:58:21Z","published":"2023-07-19T07:58:21Z","title":"GenKL: An Iterative Framework for Resolving Label Ambiguity and Label\n Non-conformity in Web Images Via a New Generalized KL Divergence","summary":" Web image datasets curated online inherently contain ambiguous\nin-distribution (ID) instances and out-of-distribution (OOD) instances, which\nwe collectively call non-conforming (NC) instances. In many recent approaches\nfor mitigating the negative effects of NC instances, the core implicit\nassumption is that the NC instances can be found via entropy maximization. For\n\"entropy\" to be well-defined, we are interpreting the output prediction vector\nof an instance as the parameter vector of a multinomial random variable, with\nrespect to some trained model with a softmax output layer. Hence, entropy\nmaximization is based on the idealized assumption that NC instances have\npredictions that are \"almost\" uniformly distributed. However, in real-world web\nimage datasets, there are numerous NC instances whose predictions are far from\nbeing uniformly distributed. To tackle the limitation of entropy maximization,\nwe propose $(\\alpha, \\beta)$-generalized KL divergence,\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$, which can be used to identify\nsignificantly more NC instances. 
Theoretical properties of\n$\\mathcal{D}_{\\text{KL}}^{\\alpha, \\beta}(p\\|q)$ are proven, and we also show\nempirically that a simple use of $\\mathcal{D}_{\\text{KL}}^{\\alpha,\n\\beta}(p\\|q)$ outperforms all baselines on the NC instance identification task.\nBuilding upon $(\\alpha,\\beta)$-generalized KL divergence, we also introduce a\nnew iterative training framework, GenKL, that identifies and relabels NC\ninstances. When evaluated on three web image datasets, Clothing1M,\nFood101/Food101N, and mini WebVision 1.0, we achieved new state-of-the-art\nclassification accuracies: $81.34\\%$, $85.73\\%$ and $78.99\\%$/$92.54\\%$\n(top-1/top-5), respectively.\n","authors":["Xia Huang","Kai Fong Ernest Chong"],"pdf_url":"https://arxiv.org/pdf/2307.09810v1.pdf","comment":"Published (with open access) at International Journal of Computer\n Vision (IJCV, 2023). 25 pages, 8 figures. Code is available at:\n https://github.com/codetopaper/GenKL"},{"id":"http://arxiv.org/abs/2307.09801v1","updated":"2023-07-19T07:40:51Z","published":"2023-07-19T07:40:51Z","title":"Graph Federated Learning Based on the Decentralized Framework","summary":" Graph learning has a wide range of applications in many scenarios, which\nrequire more need for data privacy. Federated learning is an emerging\ndistributed machine learning approach that leverages data from individual\ndevices or data centers to improve the accuracy and generalization of the\nmodel, while also protecting the privacy of user data. Graph-federated learning\nis mainly based on the classical federated learning framework i.e., the\nClient-Server framework. However, the Client-Server framework faces problems\nsuch as a single point of failure of the central server and poor scalability of\nnetwork topology. First, we introduce the decentralized framework to\ngraph-federated learning. Second, determine the confidence among nodes based on\nthe similarity of data among nodes, subsequently, the gradient information is\nthen aggregated by linear weighting based on confidence. Finally, the proposed\nmethod is compared with FedAvg, Fedprox, GCFL, and GCFL+ to verify the\neffectiveness of the proposed method. Experiments demonstrate that the proposed\nmethod outperforms other methods.\n","authors":["Peilin Liu","Yanni Tang","Mingyue Zhang","Wu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09801v1.pdf","comment":"12 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.09797v1","updated":"2023-07-19T07:31:37Z","published":"2023-07-19T07:31:37Z","title":"Probabilistic Forecasting with Coherent Aggregation","summary":" Obtaining accurate probabilistic forecasts while respecting hierarchical\ninformation is an important operational challenge in many applications, perhaps\nmost obviously in energy management, supply chain planning, and resource\nallocation. The basic challenge, especially for multivariate forecasting, is\nthat forecasts are often required to be coherent with respect to the\nhierarchical structure. In this paper, we propose a new model which leverages a\nfactor model structure to produce coherent forecasts by construction. 
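The coherent-forecasting abstract above builds coherence into the model: every aggregate is a fixed linear map of the base-level series, so aggregating samples drawn from a shared factor model is coherent by construction. A small NumPy sketch with a hypothetical three-aggregate hierarchy and Gaussian factors illustrates the point:

```python
import numpy as np

rng = np.random.default_rng(0)
n_bottom, n_factors, n_samples = 4, 2, 1000

# hierarchy: total = b0+b1+b2+b3, plus two groups (b0+b1) and (b2+b3)
S = np.array([[1, 1, 1, 1],
              [1, 1, 0, 0],
              [0, 0, 1, 1]])

# hypothetical factor model: bottom series = loadings @ factors + noise
loadings = rng.normal(size=(n_bottom, n_factors))
factors = rng.normal(size=(n_factors, n_samples))      # shared factor draws
noise = 0.1 * rng.normal(size=(n_bottom, n_samples))
bottom_samples = loadings @ factors + noise            # (n_bottom, n_samples)

# aggregating the *samples* is coherent with the hierarchy by construction,
# because each aggregate is just a fixed linear map of the bottom-level draws
aggregate_samples = S @ bottom_samples
assert np.allclose(aggregate_samples[0], bottom_samples.sum(axis=0))
print(aggregate_samples.mean(axis=1))  # sample-based forecasts for the aggregates
```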
This is a\nconsequence of a simple (exchangeability) observation: permuting\nbase-level series in the hierarchy does not change their aggregates.\nOur model uses a convolutional neural network to produce parameters for the\nfactors, their loadings and base-level distributions; it produces samples which\ncan be differentiated with respect to the model's parameters; and it can\ntherefore optimize for any sample-based loss function, including the Continuous\nRanked Probability Score and quantile losses. We can choose arbitrary\ncontinuous distributions for the factor and the base-level distributions. We\ncompare our method to two previous methods which can be optimized end-to-end,\nwhile enforcing coherent aggregation. Our model achieves significant\nimprovements: between $11.8-41.4\%$ on three hierarchical forecasting datasets.\nWe also analyze the influence of parameters in our model with respect to\nbase-level distribution and number of factors.\n","authors":["Geoffrey Négiar","Ruijun Ma","O. Nangba Meetei","Mengfei Cao","Michael W. Mahoney"],"pdf_url":"https://arxiv.org/pdf/2307.09797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05086v3","updated":"2023-07-19T07:31:35Z","published":"2023-02-10T07:08:13Z","title":"Making Substitute Models More Bayesian Can Enhance Transferability of\n Adversarial Examples","summary":" The transferability of adversarial examples across deep neural networks\n(DNNs) is the crux of many black-box attacks. Many prior efforts have been\ndevoted to improving the transferability via increasing the diversity in inputs\nof some substitute models. In this paper, by contrast, we opt for the diversity\nin substitute models and advocate to attack a Bayesian model for achieving\ndesirable transferability. Deriving from the Bayesian formulation, we develop a\nprincipled strategy for possible finetuning, which can be combined with many\noff-the-shelf Gaussian posterior approximations over DNN parameters. Extensive\nexperiments have been conducted to verify the effectiveness of our method, on\ncommon benchmark datasets, and the results demonstrate that our method\noutperforms recent state-of-the-arts by large margins (roughly 19% absolute\nincrease in average attack success rate on ImageNet), and, by combining with\nthese recent methods, further performance gain can be obtained. Our code:\nhttps://github.com/qizhangli/MoreBayesian-attack.\n","authors":["Qizhang Li","Yiwen Guo","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2302.05086v3.pdf","comment":"Accepted by ICLR 2023, fix typos"},{"id":"http://arxiv.org/abs/2307.09796v1","updated":"2023-07-19T07:30:01Z","published":"2023-07-19T07:30:01Z","title":"Forecasting Early with Meta Learning","summary":" In the early observation period of a time series, there might be only a few\nhistoric observations available to learn a model. However, in cases where an\nexisting prior set of datasets is available, Meta learning methods can be\napplicable. In this paper, we devise a Meta learning method that exploits\nsamples from additional datasets and learns to augment time series through\nadversarial learning as an auxiliary task for the target dataset. Our model\n(FEML) is equipped with a shared Convolutional backbone that learns features\nfor varying length inputs from different datasets and has dataset specific\nheads to forecast for different output lengths. 
We show that FEML can meta\nlearn across datasets and by additionally learning on adversarial generated\nsamples as auxiliary samples for the target dataset, it can improve the\nforecasting performance compared to single task learning, and various solutions\nadapted from Joint learning, Multi-task learning and classic forecasting\nbaselines.\n","authors":["Shayan Jawed","Kiran Madhusudhanan","Vijaya Krishna Yalavarthi","Lars Schmidt-Thieme"],"pdf_url":"https://arxiv.org/pdf/2307.09796v1.pdf","comment":"IJCNN 2023"},{"id":"http://arxiv.org/abs/2307.09795v1","updated":"2023-07-19T07:29:14Z","published":"2023-07-19T07:29:14Z","title":"From West to East: Who can understand the music of the others better?","summary":" Recent developments in MIR have led to several benchmark deep learning models\nwhose embeddings can be used for a variety of downstream tasks. At the same\ntime, the vast majority of these models have been trained on Western pop/rock\nmusic and related styles. This leads to research questions on whether these\nmodels can be used to learn representations for different music cultures and\nstyles, or whether we can build similar music audio embedding models trained on\ndata from different cultures or styles. To that end, we leverage transfer\nlearning methods to derive insights about the similarities between the\ndifferent music cultures to which the data belongs to. We use two Western music\ndatasets, two traditional/folk datasets coming from eastern Mediterranean\ncultures, and two datasets belonging to Indian art music. Three deep audio\nembedding models are trained and transferred across domains, including two\nCNN-based and a Transformer-based architecture, to perform auto-tagging for\neach target domain dataset. Experimental results show that competitive\nperformance is achieved in all domains via transfer learning, while the best\nsource dataset varies for each music culture. The implementation and the\ntrained models are both provided in a public repository.\n","authors":["Charilaos Papaioannou","Emmanouil Benetos","Alexandros Potamianos"],"pdf_url":"https://arxiv.org/pdf/2307.09795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09792v1","updated":"2023-07-19T07:17:06Z","published":"2023-07-19T07:17:06Z","title":"A Note on Hardness of Computing Recursive Teaching Dimension","summary":" In this short note, we show that the problem of computing the recursive\nteaching dimension (RTD) for a concept class (given explicitly as input)\nrequires $n^{\\Omega(\\log n)}$-time, assuming the exponential time hypothesis\n(ETH). This matches the running time $n^{O(\\log n)}$ of the brute-force\nalgorithm for the problem.\n","authors":["Pasin Manurangsi"],"pdf_url":"https://arxiv.org/pdf/2307.09792v1.pdf","comment":"To appear in IPL"},{"id":"http://arxiv.org/abs/2307.09782v1","updated":"2023-07-19T06:58:03Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. 
Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09781v1","updated":"2023-07-19T06:56:07Z","published":"2023-07-19T06:56:07Z","title":"Text2Layer: Layered Image Generation using Latent Diffusion Model","summary":" Layer compositing is one of the most popular image editing workflows among\nboth amateurs and professionals. Motivated by the success of diffusion models,\nwe explore layer compositing from a layered image generation perspective.\nInstead of generating an image, we propose to generate background, foreground,\nlayer mask, and the composed image simultaneously. To achieve layered image\ngeneration, we train an autoencoder that is able to reconstruct layered images\nand train diffusion models on the latent representation. One benefit of the\nproposed problem is to enable better compositing workflows in addition to the\nhigh-quality image output. Another benefit is producing higher-quality layer\nmasks compared to masks produced by a separate step of image segmentation.\nExperimental results show that the proposed method is able to generate\nhigh-quality layered images and initiates a benchmark for future work.\n","authors":["Xinyang Zhang","Wentian Zhao","Xin Lu","Jeff Chien"],"pdf_url":"https://arxiv.org/pdf/2307.09781v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2212.01692v4","updated":"2023-07-19T06:48:35Z","published":"2022-12-03T21:14:32Z","title":"Can In-context Learners Learn a Reasoning Concept from Demonstrations?","summary":" Language models exhibit an emergent ability to learn a new task from a small\nnumber of input-output demonstrations. However, recent work shows that\nin-context learners largely rely on their pre-trained knowledge, such as the\nsentiment of the labels, instead of learning new associations from the input.\nWe argue that the commonly-used few-shot evaluation using a random selection of\nin-context demonstrations can not disentangle models' reliance on such biases,\nas most of the randomly-selected demonstrations do not present relations\ninformative for prediction beyond exposing the task's input-output\ndistribution.\n Therefore, to evaluate models' in-context learning ability independent of\nmodels' memory, we introduce a Concept-sharing few-shot learning method\nchoosing the demonstrations that share an underlying concept with the predicted\nsample. 
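The concept-sharing few-shot abstract above selects demonstrations that share an underlying concept with the predicted sample. A minimal sketch of such a selection step, assuming concept annotations are available as sets attached to each pool example (the field names and helper are invented for illustration):

```python
import random

def concept_sharing_demonstrations(pool, target_concepts, k=4, seed=0):
    """Pick k demonstrations whose annotated concepts overlap with the target's.
    `pool` is a list of dicts with 'input', 'output' and a set of 'concepts';
    these field names are hypothetical, not from the paper."""
    rng = random.Random(seed)
    sharing = [ex for ex in pool if ex["concepts"] & set(target_concepts)]
    rng.shuffle(sharing)
    return sharing[:k]

pool = [
    {"input": "2 apples + 3 apples?", "output": "5", "concepts": {"addition"}},
    {"input": "10 - 4?", "output": "6", "concepts": {"subtraction"}},
    {"input": "3 + 9?", "output": "12", "concepts": {"addition"}},
]
demos = concept_sharing_demonstrations(pool, target_concepts=["addition"], k=2)
prompt = "\n".join(f"Q: {d['input']}\nA: {d['output']}" for d in demos)
print(prompt + "\nQ: 7 + 5?\nA:")
```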
We extract a set of such concepts from available human explanations and\nmeasure how much models can benefit from presenting these concepts in few-shot\ndemonstrations.\n We find that most of the recent in-context learners can not consistently\nbenefit from the demonstrated concepts, irrespective of the model size.\nHowever, we note that T0 models are more sensitive to exhibited concepts,\nbenefiting from concept-sharing demonstrations in 7 out of 8 evaluation\nscenarios.\n","authors":["Michal Štefánik","Marek Kadlčík"],"pdf_url":"https://arxiv.org/pdf/2212.01692v4.pdf","comment":"Awarded Best Paper at ACL 2023 Natural Language Reasoning and\n Structured Explanations (NLRSE) workshop"},{"id":"http://arxiv.org/abs/2307.09779v1","updated":"2023-07-19T06:48:33Z","published":"2023-07-19T06:48:33Z","title":"Beyond Single-Feature Importance with ICECREAM","summary":" Which set of features was responsible for a certain output of a machine\nlearning model? Which components caused the failure of a cloud computing\napplication? These are just two examples of questions we are addressing in this\nwork by Identifying Coalition-based Explanations for Common and Rare Events in\nAny Model (ICECREAM). Specifically, we propose an information-theoretic\nquantitative measure for the influence of a coalition of variables on the\ndistribution of a target variable. This allows us to identify which set of\nfactors is essential to obtain a certain outcome, as opposed to\nwell-established explainability and causal contribution analysis methods which\ncan assign contributions only to individual factors and rank them by their\nimportance. In experiments with synthetic and real-world data, we show that\nICECREAM outperforms state-of-the-art methods for explainability and root cause\nanalysis, and achieves impressive accuracy in both tasks.\n","authors":["Michael Oesterle","Patrick Blöbaum","Atalanti A. Mastakouri","Elke Kirschbaum"],"pdf_url":"https://arxiv.org/pdf/2307.09779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03638v4","updated":"2023-07-19T06:43:10Z","published":"2022-06-08T01:50:08Z","title":"Alternately Optimized Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have greatly advanced the semi-supervised node\nclassification task on graphs. The majority of existing GNNs are trained in an\nend-to-end manner that can be viewed as tackling a bi-level optimization\nproblem. This process is often inefficient in computation and memory usage. In\nthis work, we propose a new optimization framework for semi-supervised learning\non graphs. The proposed framework can be conveniently solved by the alternating\noptimization algorithms, resulting in significantly improved efficiency.\nExtensive experiments demonstrate that the proposed method can achieve\ncomparable or better performance with state-of-the-art baselines while it has\nsignificantly better computation and memory efficiency.\n","authors":["Haoyu Han","Xiaorui Liu","Haitao Mao","MohamadAli Torkamani","Feng Shi","Victor Lee","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2206.03638v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09771v1","updated":"2023-07-19T06:17:16Z","published":"2023-07-19T06:17:16Z","title":"A Novel Spatial-Temporal Variational Quantum Circuit to Enable Deep\n Learning on NISQ Devices","summary":" Quantum computing presents a promising approach for machine learning with its\ncapability for extremely parallel computation in high-dimension through\nsuperposition and entanglement. 
Despite its potential, existing quantum\nlearning algorithms, such as Variational Quantum Circuits(VQCs), face\nchallenges in handling more complex datasets, particularly those that are not\nlinearly separable. What's more, it encounters the deployability issue, making\nthe learning models suffer a drastic accuracy drop after deploying them to the\nactual quantum devices. To overcome these limitations, this paper proposes a\nnovel spatial-temporal design, namely ST-VQC, to integrate non-linearity in\nquantum learning and improve the robustness of the learning model to noise.\nSpecifically, ST-VQC can extract spatial features via a novel block-based\nencoding quantum sub-circuit coupled with a layer-wise computation quantum\nsub-circuit to enable temporal-wise deep learning. Additionally, a SWAP-Free\nphysical circuit design is devised to improve robustness. These designs bring a\nnumber of hyperparameters. After a systematic analysis of the design space for\neach design component, an automated optimization framework is proposed to\ngenerate the ST-VQC quantum circuit. The proposed ST-VQC has been evaluated on\ntwo IBM quantum processors, ibm_cairo with 27 qubits and ibmq_lima with 7\nqubits to assess its effectiveness. The results of the evaluation on the\nstandard dataset for binary classification show that ST-VQC can achieve over\n30% accuracy improvement compared with existing VQCs on actual quantum\ncomputers. Moreover, on a non-linear synthetic dataset, the ST-VQC outperforms\na linear classifier by 27.9%, while the linear classifier using classical\ncomputing outperforms the existing VQC by 15.58%.\n","authors":["Jinyang Li","Zhepeng Wang","Zhirui Hu","Prasanna Date","Ang Li","Weiwen Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.09771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11434v2","updated":"2023-07-19T06:17:10Z","published":"2022-03-22T03:13:39Z","title":"Non-linear Embeddings in Hilbert Simplex Geometry","summary":" A key technique of machine learning and computer vision is to embed discrete\nweighted graphs into continuous spaces for further downstream processing.\nEmbedding discrete hierarchical structures in hyperbolic geometry has proven\nvery successful since it was shown that any weighted tree can be embedded in\nthat geometry with arbitrary low distortion. Various optimization methods for\nhyperbolic embeddings based on common models of hyperbolic geometry have been\nstudied. In this paper, we consider Hilbert geometry for the standard simplex\nwhich is isometric to a vector space equipped with the variation polytope norm.\nWe study the representation power of this Hilbert simplex geometry by embedding\ndistance matrices of graphs. Our findings demonstrate that Hilbert simplex\ngeometry is competitive to alternative geometries such as the Poincar\\'e\nhyperbolic ball or the Euclidean geometry for embedding tasks while being fast\nand numerically robust.\n","authors":["Frank Nielsen","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2203.11434v2.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.09768v1","updated":"2023-07-19T06:05:33Z","published":"2023-07-19T06:05:33Z","title":"How Curvature Enhance the Adaptation Power of Framelet GCNs","summary":" Graph neural network (GNN) has been demonstrated powerful in modeling\ngraph-structured data. 
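The Hilbert simplex geometry abstract above embeds graph distance matrices into the standard simplex equipped with the Hilbert metric. A small NumPy helper for the usual closed form of that distance, d(p, q) = log(max_i p_i/q_i) - log(min_i p_i/q_i), assuming strictly positive points of the simplex:

```python
import numpy as np

def hilbert_simplex_distance(p, q, eps=1e-12):
    """Hilbert metric between two points of the open probability simplex:
    d(p, q) = log(max_i p_i/q_i) - log(min_i p_i/q_i)."""
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    log_ratio = np.log(p) - np.log(q)
    return float(log_ratio.max() - log_ratio.min())

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.3, 0.4, 0.3])
print(hilbert_simplex_distance(p, q))   # > 0
print(hilbert_simplex_distance(p, p))   # 0: identical points are at distance zero
```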
However, despite many successful cases of applying GNNs\nto various graph classification and prediction tasks, whether the graph\ngeometrical information has been fully exploited to enhance the learning\nperformance of GNNs is not yet well understood. This paper introduces a new\napproach to enhance GNN by discrete graph Ricci curvature. Specifically, the\ngraph Ricci curvature defined on the edges of a graph measures how difficult\nthe information transits on one edge from one node to another based on their\nneighborhoods. Motivated by the geometric analogy of Ricci curvature in the\ngraph setting, we prove that by inserting the curvature information with\ndifferent carefully designed transformation function $\\zeta$, several known\ncomputational issues in GNN such as over-smoothing can be alleviated in our\nproposed model. Furthermore, we verified that edges with very positive Ricci\ncurvature (i.e., $\\kappa_{i,j} \\approx 1$) are preferred to be dropped to\nenhance model's adaption to heterophily graph and one curvature based graph\nedge drop algorithm is proposed. Comprehensive experiments show that our\ncurvature-based GNN model outperforms the state-of-the-art baselines in both\nhomophily and heterophily graph datasets, indicating the effectiveness of\ninvolving graph geometric information in GNNs.\n","authors":["Dai Shi","Yi Guo","Zhiqi Shao","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2307.09768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14048v2","updated":"2023-07-19T06:02:38Z","published":"2023-06-24T20:11:14Z","title":"H$_2$O: Heavy-Hitter Oracle for Efficient Generative Inference of Large\n Language Models","summary":" Large Language Models (LLMs), despite their recent impressive\naccomplishments, are notably cost-prohibitive to deploy, particularly for\napplications involving long-content generation, such as dialogue systems and\nstory writing. Often, a large amount of transient state information, referred\nto as the KV cache, is stored in GPU memory in addition to model parameters,\nscaling linearly with the sequence length and batch size. In this paper, we\nintroduce a novel approach for implementing the KV cache which significantly\nreduces its memory footprint. Our approach is based on the noteworthy\nobservation that a small portion of tokens contributes most of the value when\ncomputing attention scores. We call these tokens Heavy Hitters (H$_2$). Through\na comprehensive investigation, we find that (i) the emergence of H$_2$ is\nnatural and strongly correlates with the frequent co-occurrence of tokens in\nthe text, and (ii) removing them results in significant performance\ndegradation. Based on these insights, we propose Heavy Hitter Oracle (H$_2$O),\na KV cache eviction policy that dynamically retains a balance of recent and\nH$_2$ tokens. We formulate the KV cache eviction as a dynamic submodular\nproblem and prove (under mild assumptions) a theoretical guarantee for our\nnovel eviction algorithm which could help guide future work. We validate the\naccuracy of our algorithm with OPT, LLaMA, and GPT-NeoX across a wide range of\ntasks. Our implementation of H$_2$O with 20% heavy hitters improves the\nthroughput over three leading inference systems DeepSpeed Zero-Inference,\nHugging Face Accelerate, and FlexGen by up to 29$\\times$, 29$\\times$, and\n3$\\times$ on OPT-6.7B and OPT-30B. With the same batch size, H2O can reduce the\nlatency by up to 1.9$\\times$. 
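The H$_2$O abstract above evicts KV-cache entries while retaining a balance of recent tokens and "heavy hitter" tokens that carry large accumulated attention. The sketch below is a simplified, stand-alone version of that eviction idea over per-token attention mass, not the paper's implementation:

```python
import numpy as np

def h2_style_eviction(acc_attention, budget, recent=8):
    """Return indices of cache entries to keep: always the `recent` newest
    tokens, plus the highest accumulated-attention ("heavy hitter") tokens
    among the older ones, up to `budget` entries in total."""
    n = len(acc_attention)
    keep_recent = list(range(max(0, n - recent), n))
    older = np.arange(0, max(0, n - recent))
    n_heavy = max(0, budget - len(keep_recent))
    heavy = older[np.argsort(acc_attention[older])[::-1][:n_heavy]]
    return sorted(set(keep_recent) | set(heavy.tolist()))

# toy cache of 16 tokens with accumulated attention mass per token
scores = np.random.default_rng(0).random(16)
print(h2_style_eviction(scores, budget=10, recent=4))
```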
The code is available at\nhttps://github.com/FMInference/H2O.\n","authors":["Zhenyu Zhang","Ying Sheng","Tianyi Zhou","Tianlong Chen","Lianmin Zheng","Ruisi Cai","Zhao Song","Yuandong Tian","Christopher Ré","Clark Barrett","Zhangyang Wang","Beidi Chen"],"pdf_url":"https://arxiv.org/pdf/2306.14048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09767v1","updated":"2023-07-19T05:58:21Z","published":"2023-07-19T05:58:21Z","title":"Sig-Splines: universal approximation and convex calibration of time\n series generative models","summary":" We propose a novel generative model for multivariate discrete-time time\nseries data. Drawing inspiration from the construction of neural spline flows,\nour algorithm incorporates linear transformations and the signature transform\nas a seamless substitution for traditional neural networks. This approach\nenables us to achieve not only the universality property inherent in neural\nnetworks but also introduces convexity in the model's parameters.\n","authors":["Magnus Wiese","Phillip Murray","Ralf Korn"],"pdf_url":"https://arxiv.org/pdf/2307.09767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08621v2","updated":"2023-07-19T05:56:42Z","published":"2023-07-17T16:40:01Z","title":"Retentive Network: A Successor to Transformer for Large Language Models","summary":" In this work, we propose Retentive Network (RetNet) as a foundation\narchitecture for large language models, simultaneously achieving training\nparallelism, low-cost inference, and good performance. We theoretically derive\nthe connection between recurrence and attention. Then we propose the retention\nmechanism for sequence modeling, which supports three computation paradigms,\ni.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel\nrepresentation allows for training parallelism. The recurrent representation\nenables low-cost $O(1)$ inference, which improves decoding throughput, latency,\nand GPU memory without sacrificing performance. The chunkwise recurrent\nrepresentation facilitates efficient long-sequence modeling with linear\ncomplexity, where each chunk is encoded parallelly while recurrently\nsummarizing the chunks. Experimental results on language modeling show that\nRetNet achieves favorable scaling results, parallel training, low-cost\ndeployment, and efficient inference. The intriguing properties make RetNet a\nstrong successor to Transformer for large language models. Code will be\navailable at https://aka.ms/retnet.\n","authors":["Yutao Sun","Li Dong","Shaohan Huang","Shuming Ma","Yuqing Xia","Jilong Xue","Jianyong Wang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2307.08621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02918v3","updated":"2023-07-19T05:51:00Z","published":"2023-03-06T06:28:20Z","title":"Graph Positional Encoding via Random Feature Propagation","summary":" Two main families of node feature augmentation schemes have been explored for\nenhancing GNNs: random features and spectral positional encoding. Surprisingly,\nhowever, there is still no clear understanding of the relation between these\ntwo augmentation schemes. Here we propose a novel family of positional encoding\nschemes which draws a link between the above two approaches and improves over\nboth. The new approach, named Random Feature Propagation (RFP), is inspired by\nthe power iteration method and its generalizations. 
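The RFP abstract above builds positional encodings by concatenating intermediate power-iteration steps applied to random node features. A minimal NumPy sketch with a fixed symmetric-normalised propagation operator (the paper also allows learned, graph-dependent operators):

```python
import numpy as np

def random_feature_propagation(adj, n_rand=8, n_steps=4, seed=0):
    """Power-iteration style positional features: start from random node
    features and concatenate the normalised intermediate propagation steps."""
    rng = np.random.default_rng(seed)
    n = adj.shape[0]
    deg = adj.sum(1)
    with np.errstate(divide="ignore"):
        d_inv_sqrt = np.where(deg > 0, deg ** -0.5, 0.0)
    a_hat = d_inv_sqrt[:, None] * adj * d_inv_sqrt[None, :]   # D^-1/2 A D^-1/2
    x = rng.normal(size=(n, n_rand))
    steps = [x]
    for _ in range(n_steps):
        x = a_hat @ x
        x = x / (np.linalg.norm(x, axis=0, keepdims=True) + 1e-12)  # column-normalise
        steps.append(x)
    return np.concatenate(steps, axis=1)   # (n_nodes, n_rand * (n_steps + 1))

adj = np.array([[0, 1, 1, 0], [1, 0, 1, 0], [1, 1, 0, 1], [0, 0, 1, 0]], float)
print(random_feature_propagation(adj).shape)   # (4, 40)
```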
It concatenates several\nintermediate steps of an iterative algorithm for computing the dominant\neigenvectors of a propagation matrix, starting from random node features.\nNotably, these propagation steps are based on graph-dependent propagation\noperators that can be either predefined or learned. We explore the theoretical\nand empirical benefits of RFP. First, we provide theoretical justifications for\nusing random features, for incorporating early propagation steps, and for using\nmultiple random initializations. Then, we empirically demonstrate that RFP\nsignificantly outperforms both spectral PE and random features in multiple node\nclassification and graph classification benchmarks.\n","authors":["Moshe Eliasof","Fabrizio Frasca","Beatrice Bevilacqua","Eran Treister","Gal Chechik","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2303.02918v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.09762v1","updated":"2023-07-19T05:45:05Z","published":"2023-07-19T05:45:05Z","title":"Reinforcing POD based model reduction techniques in reaction-diffusion\n complex networks using stochastic filtering and pattern recognition","summary":" Complex networks are used to model many real-world systems. However, the\ndimensionality of these systems can make them challenging to analyze.\nDimensionality reduction techniques like POD can be used in such cases.\nHowever, these models are susceptible to perturbations in the input data. We\npropose an algorithmic framework that combines techniques from pattern\nrecognition (PR) and stochastic filtering theory to enhance the output of such\nmodels. The results of our study show that our method can improve the accuracy\nof the surrogate model under perturbed inputs. Deep Neural Networks (DNNs) are\nsusceptible to adversarial attacks. However, recent research has revealed that\nneural Ordinary Differential Equations (ODEs) exhibit robustness in specific\napplications. We benchmark our algorithmic framework with a Neural ODE-based\napproach as a reference.\n","authors":["Abhishek Ajayakumar","Soumyendu Raha"],"pdf_url":"https://arxiv.org/pdf/2307.09762v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.04603v3","updated":"2023-07-19T05:43:44Z","published":"2023-07-07T09:01:42Z","title":"Solvent: A Framework for Protein Folding","summary":" Consistency and reliability are crucial for conducting AI research. Many\nfamous research fields, such as object detection, have been compared and\nvalidated with solid benchmark frameworks. After AlphaFold2, the protein\nfolding task has entered a new phase, and many methods have been proposed based\non the components of AlphaFold2. A unified research framework for protein\nfolding, containing implementations and benchmarks, is important to consistently\nand fairly compare various approaches. To achieve this, we present Solvent, a\nprotein folding framework that supports significant components of\nstate-of-the-art models in the manner of an off-the-shelf interface. Solvent\ncontains different models implemented in a unified codebase and supports\ntraining and evaluation for defined models on the same dataset. We benchmark\nwell-known algorithms and their components and provide experiments that give\nhelpful insights into the protein structure modeling field. We hope that\nSolvent will increase the reliability and consistency of proposed models and\nimprove efficiency in both speed and cost, accelerating research on protein\nfolding modeling. 
The code is available at\nhttps://github.com/kakaobrain/solvent, and the project will continue to be\ndeveloped.\n","authors":["Jaemyung Lee","Kyeongtak Han","Jaehoon Kim","Hasun Yu","Youhan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.04603v3.pdf","comment":"preprint, 8pages"},{"id":"http://arxiv.org/abs/2307.09759v1","updated":"2023-07-19T05:41:40Z","published":"2023-07-19T05:41:40Z","title":"Constructing Extreme Learning Machines with zero Spectral Bias","summary":" The phenomena of Spectral Bias, where the higher frequency components of a\nfunction being learnt in a feedforward Artificial Neural Network (ANN) are seen\nto converge more slowly than the lower frequencies, is observed ubiquitously\nacross ANNs. This has created technology challenges in fields where resolution\nof higher frequencies is crucial, like in Physics Informed Neural Networks\n(PINNs). Extreme Learning Machines (ELMs) that obviate an iterative solution\nprocess which provides the theoretical basis of Spectral Bias (SB), should in\nprinciple be free of the same. This work verifies the reliability of this\nassumption, and shows that it is incorrect. However, the structure of ELMs\nmakes them naturally amenable to implementation of variants of Fourier Feature\nEmbeddings, which have been shown to mitigate SB in ANNs. This approach is\nimplemented and verified to completely eliminate SB, thus bringing into\nfeasibility the application of ELMs for practical problems like PINNs where\nresolution of higher frequencies is essential.\n","authors":["Kaumudi Joshi","Vukka Snigdha","Arya Kumar Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.09759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12239v2","updated":"2023-07-19T05:32:04Z","published":"2023-05-20T17:13:06Z","title":"Off-Policy Average Reward Actor-Critic with Deterministic Policy Search","summary":" The average reward criterion is relatively less studied as most existing\nworks in the Reinforcement Learning literature consider the discounted reward\ncriterion. There are few recent works that present on-policy average reward\nactor-critic algorithms, but average reward off-policy actor-critic is\nrelatively less explored. In this work, we present both on-policy and\noff-policy deterministic policy gradient theorems for the average reward\nperformance criterion. Using these theorems, we also present an Average Reward\nOff-Policy Deep Deterministic Policy Gradient (ARO-DDPG) Algorithm. We first\nshow asymptotic convergence analysis using the ODE-based method. Subsequently,\nwe provide a finite time analysis of the resulting stochastic approximation\nscheme with linear function approximator and obtain an $\\epsilon$-optimal\nstationary policy with a sample complexity of $\\Omega(\\epsilon^{-2.5})$. We\ncompare the average reward performance of our proposed ARO-DDPG algorithm and\nobserve better empirical performance compared to state-of-the-art on-policy\naverage reward actor-critic algorithms over MuJoCo-based environments.\n","authors":["Naman Saxena","Subhojyoti Khastigir","Shishir Kolathaya","Shalabh Bhatnagar"],"pdf_url":"https://arxiv.org/pdf/2305.12239v2.pdf","comment":"Accepted at ICML 2023"},{"id":"http://arxiv.org/abs/2208.06265v2","updated":"2023-07-19T05:08:06Z","published":"2022-08-10T08:28:46Z","title":"Trustworthy Recommender Systems","summary":" Recommender systems (RSs) aim to help users to effectively retrieve items of\ntheir interests from a large catalogue. 
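The spectral-bias abstract above combines Extreme Learning Machines with Fourier feature embeddings. A compact sketch of that recipe, assuming a random sinusoidal feature map followed by the usual ELM closed-form (regularised least-squares) readout; the frequency scale and sizes are arbitrary choices for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)

# target with a high-frequency component that iteratively trained nets fit slowly
x = np.linspace(0, 1, 400)[:, None]
y = np.sin(2 * np.pi * x) + 0.3 * np.sin(2 * np.pi * 25 * x)

# Fourier-feature "hidden layer": fixed random frequencies, no iterative training
B = rng.normal(scale=30.0, size=(1, 128))                  # hypothetical frequency scale
H = np.concatenate([np.sin(x @ B), np.cos(x @ B)], axis=1)

# ELM step: output weights from a single regularised least-squares solve
lam = 1e-6
W = np.linalg.solve(H.T @ H + lam * np.eye(H.shape[1]), H.T @ y)

pred = H @ W
print("train RMSE:", float(np.sqrt(np.mean((pred - y) ** 2))))
```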
For a quite long period of time,\nresearchers and practitioners have been focusing on developing accurate RSs.\nRecent years have witnessed an increasing number of threats to RSs, coming from\nattacks, system and user generated noise, system bias. As a result, it has\nbecome clear that a strict focus on RS accuracy is limited and the research\nmust consider other important factors, e.g., trustworthiness. For end users, a\ntrustworthy RS (TRS) should not only be accurate, but also transparent,\nunbiased and fair as well as robust to noise or attacks. These observations\nactually led to a paradigm shift of the research on RSs: from accuracy-oriented\nRSs to TRSs. However, researchers lack a systematic overview and discussion of\nthe literature in this novel and fast developing field of TRSs. To this end, in\nthis paper, we provide an overview of TRSs, including a discussion of the\nmotivation and basic concepts of TRSs, a presentation of the challenges in\nbuilding TRSs, and a perspective on the future directions in this area. We also\nprovide a novel conceptual framework to support the construction of TRSs.\n","authors":["Shoujin Wang","Xiuzhen Zhang","Yan Wang","Huan Liu","Francesco Ricci"],"pdf_url":"https://arxiv.org/pdf/2208.06265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01646v2","updated":"2023-07-19T04:59:35Z","published":"2023-07-04T10:58:42Z","title":"SwinGNN: Rethinking Permutation Invariance in Diffusion Models for Graph\n Generation","summary":" Diffusion models based on permutation-equivariant networks can learn\npermutation-invariant distributions for graph data. However, in comparison to\ntheir non-invariant counterparts, we have found that these invariant models\nencounter greater learning challenges since 1) their effective target\ndistributions exhibit more modes; 2) their optimal one-step denoising scores\nare the score functions of Gaussian mixtures with more components. Motivated by\nthis analysis, we propose a non-invariant diffusion model, called\n$\\textit{SwinGNN}$, which employs an efficient edge-to-edge 2-WL message\npassing network and utilizes shifted window based self-attention inspired by\nSwinTransformers. Further, through systematic ablations, we identify several\ncritical training and sampling techniques that significantly improve the sample\nquality of graph generation. At last, we introduce a simple post-processing\ntrick, $\\textit{i.e.}$, randomly permuting the generated graphs, which provably\nconverts any graph generative model to a permutation-invariant one. Extensive\nexperiments on synthetic and real-world protein and molecule datasets show that\nour SwinGNN achieves state-of-the-art performances. Our code is released at\nhttps://github.com/qiyan98/SwinGNN.\n","authors":["Qi Yan","Zhengyang Liang","Yang Song","Renjie Liao","Lele Wang"],"pdf_url":"https://arxiv.org/pdf/2307.01646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03597v4","updated":"2023-07-19T04:52:33Z","published":"2022-06-07T21:30:58Z","title":"Meta-Learning Parameterized Skills","summary":" We propose a novel parameterized skill-learning algorithm that aims to learn\ntransferable parameterized skills and synthesize them into a new action space\nthat supports efficient learning in long-horizon tasks. We propose to leverage\noff-policy Meta-RL combined with a trajectory-centric smoothness term to learn\na set of parameterized skills. 
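The SwinGNN abstract above notes that randomly permuting generated graphs provably turns any graph generative model into a permutation-invariant one. The post-processing step itself is tiny; a NumPy sketch:

```python
import numpy as np

def randomly_permute_graph(adj, node_feats=None, seed=None):
    """Post-process a generated graph by relabelling its nodes with a uniform
    random permutation, applied consistently to the adjacency matrix and,
    if given, the node features."""
    rng = np.random.default_rng(seed)
    perm = rng.permutation(adj.shape[0])
    adj_perm = adj[np.ix_(perm, perm)]
    feats_perm = None if node_feats is None else node_feats[perm]
    return adj_perm, feats_perm

adj = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]])
feats = np.array([[1.0], [2.0], [3.0]])
print(randomly_permute_graph(adj, feats, seed=0))
```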
Our agent can use these learned skills to\nconstruct a three-level hierarchical framework that models a\nTemporally-extended Parameterized Action Markov Decision Process. We\nempirically demonstrate that the proposed algorithms enable an agent to solve a\nset of difficult long-horizon (obstacle-course and robot manipulation) tasks.\n","authors":["Haotian Fu","Shangqun Yu","Saket Tiwari","Michael Littman","George Konidaris"],"pdf_url":"https://arxiv.org/pdf/2206.03597v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09742v1","updated":"2023-07-19T04:07:33Z","published":"2023-07-19T04:07:33Z","title":"Improved Distribution Matching for Dataset Condensation","summary":" Dataset Condensation aims to condense a large dataset into a smaller one\nwhile maintaining its ability to train a well-performing model, thus reducing\nthe storage cost and training effort in deep learning applications. However,\nconventional dataset condensation methods are optimization-oriented and\ncondense the dataset by performing gradient or parameter matching during model\noptimization, which is computationally intensive even on small datasets and\nmodels. In this paper, we propose a novel dataset condensation method based on\ndistribution matching, which is more efficient and promising. Specifically, we\nidentify two important shortcomings of naive distribution matching (i.e.,\nimbalanced feature numbers and unvalidated embeddings for distance computation)\nand address them with three novel techniques (i.e., partitioning and expansion\naugmentation, efficient and enriched model sampling, and class-aware\ndistribution regularization). Our simple yet effective method outperforms most\nprevious optimization-oriented methods with much fewer computational resources,\nthereby scaling data condensation to larger datasets and models. Extensive\nexperiments demonstrate the effectiveness of our method. Codes are available at\nhttps://github.com/uitrbn/IDM\n","authors":["Ganlong Zhao","Guanbin Li","Yipeng Qin","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2307.09742v1.pdf","comment":"CVPR2023"},{"id":"http://arxiv.org/abs/2302.11665v2","updated":"2023-07-19T04:03:11Z","published":"2023-02-22T21:41:34Z","title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep\n Learning Serving","summary":" Model parallelism is conventionally viewed as a method to scale a single\nlarge deep learning model beyond the memory limits of a single device. In this\npaper, we demonstrate that model parallelism can be additionally used for the\nstatistical multiplexing of multiple devices when serving multiple models, even\nwhen a single model can fit into a single device. Our work reveals a\nfundamental trade-off between the overhead introduced by model parallelism and\nthe opportunity to exploit statistical multiplexing to reduce serving latency\nin the presence of bursty workloads. We explore the new trade-off space and\npresent a novel serving system, AlpaServe, that determines an efficient\nstrategy for placing and parallelizing collections of large deep learning\nmodels across a distributed cluster. Evaluation results on production workloads\nshow that AlpaServe can process requests at up to 10x higher rates or 6x more\nburstiness while staying within latency constraints for more than 99% of\nrequests.\n","authors":["Zhuohan Li","Lianmin Zheng","Yinmin Zhong","Vincent Liu","Ying Sheng","Xin Jin","Yanping Huang","Zhifeng Chen","Hao Zhang","Joseph E. 
Gonzalez","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2302.11665v2.pdf","comment":"OSDI 2023"},{"id":"http://arxiv.org/abs/2305.16165v2","updated":"2023-07-19T02:42:46Z","published":"2023-05-11T21:20:29Z","title":"A Conceptual Model for End-to-End Causal Discovery in Knowledge Tracing","summary":" In this paper, we take a preliminary step towards solving the problem of\ncausal discovery in knowledge tracing, i.e., finding the underlying causal\nrelationship among different skills from real-world student response data. This\nproblem is important since it can potentially help us understand the causal\nrelationship between different skills without extensive A/B testing, which can\npotentially help educators to design better curricula according to skill\nprerequisite information. Specifically, we propose a conceptual solution, a\nnovel causal gated recurrent unit (GRU) module in a modified deep knowledge\ntracing model, which uses i) a learnable permutation matrix for causal ordering\namong skills and ii) an optionally learnable lower-triangular matrix for causal\nstructure among skills. We also detail how to learn the model parameters in an\nend-to-end, differentiable way. Our solution placed among the top entries in\nTask 3 of the NeurIPS 2022 Challenge on Causal Insights for Learning Paths in\nEducation. We detail preliminary experiments as evaluated on the challenge's\npublic leaderboard since the ground truth causal structure has not been\npublicly released, making detailed local evaluation impossible.\n","authors":["Nischal Ashok Kumar","Wanyong Feng","Jaewook Lee","Hunter McNichols","Aritra Ghosh","Andrew Lan"],"pdf_url":"https://arxiv.org/pdf/2305.16165v2.pdf","comment":"16th International Conference on Educational Data Mining (EDM 2023)"},{"id":"http://arxiv.org/abs/2305.00909v4","updated":"2023-07-19T02:41:58Z","published":"2023-04-28T01:47:09Z","title":"Outline, Then Details: Syntactically Guided Coarse-To-Fine Code\n Generation","summary":" For a complicated algorithm, its implementation by a human programmer usually\nstarts with outlining a rough control flow followed by iterative enrichments,\neventually yielding carefully generated syntactic structures and variables in a\nhierarchy. However, state-of-the-art large language models generate codes in a\nsingle pass, without intermediate warm-ups to reflect the structured thought\nprocess of \"outline-then-detail\". Inspired by the recent success of\nchain-of-thought prompting, we propose ChainCoder, a program synthesis language\nmodel that generates Python code progressively, i.e. from coarse to fine in\nmultiple passes. We first decompose source code into layout frame components\nand accessory components via abstract syntax tree parsing to construct a\nhierarchical representation. We then reform our prediction target into a\nmulti-pass objective, each pass generates a subsequence, which is concatenated\nin the hierarchy. Finally, a tailored transformer architecture is leveraged to\njointly encode the natural language descriptions and syntactically aligned I/O\ndata samples. Extensive evaluations show that ChainCoder outperforms\nstate-of-the-arts, demonstrating that our progressive generation eases the\nreasoning procedure and guides the language model to generate higher-quality\nsolutions. 
Our codes are available at:\nhttps://github.com/VITA-Group/ChainCoder.\n","authors":["Wenqing Zheng","S P Sharan","Ajay Kumar Jaiswal","Kevin Wang","Yihan Xi","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2305.00909v4.pdf","comment":"Accepted in ICML 2023"},{"id":"http://arxiv.org/abs/2307.09706v1","updated":"2023-07-19T01:37:31Z","published":"2023-07-19T01:37:31Z","title":"RaTE: a Reproducible automatic Taxonomy Evaluation by Filling the Gap","summary":" Taxonomies are an essential knowledge representation, yet most studies on\nautomatic taxonomy construction (ATC) resort to manual evaluation to score\nproposed algorithms. We argue that automatic taxonomy evaluation (ATE) is just\nas important as taxonomy construction. We propose RaTE, an automatic label-free\ntaxonomy scoring procedure, which relies on a large pre-trained language model.\nWe apply our evaluation procedure to three state-of-the-art ATC algorithms with\nwhich we built seven taxonomies from the Yelp domain, and show that 1) RaTE\ncorrelates well with human judgments and 2) artificially degrading a taxonomy\nleads to decreasing RaTE score.\n","authors":["Tianjian Gao","Phillipe Langlais"],"pdf_url":"https://arxiv.org/pdf/2307.09706v1.pdf","comment":"15th International Conference on Computational Semantics (IWCS),\n Association for Computational Linguistics (ACL)"},{"id":"http://arxiv.org/abs/2307.03135v2","updated":"2023-07-19T01:28:30Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. We propose two principles\nfrom vision and language modality perspectives to enhance student's OOD\ngeneralization: (1) by better imitating teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and finegrained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate their techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. 
Code released at\nhttps://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v2.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2307.09702v1","updated":"2023-07-19T01:14:49Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for LLMs","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09692v1","updated":"2023-07-19T00:31:58Z","published":"2023-07-19T00:31:58Z","title":"STRAPPER: Preference-based Reinforcement Learning via Self-training\n Augmentation and Peer Regularization","summary":" Preference-based reinforcement learning (PbRL) promises to learn a complex\nreward function with binary human preference. However, such human-in-the-loop\nformulation requires considerable human effort to assign preference labels to\nsegment pairs, hindering its large-scale applications. Recent approaches have\ntried to reuse unlabeled segments, which implicitly elucidates the distribution\nof segments and thereby alleviates the human effort. Consistency\nregularization is further considered to improve the performance of\nsemi-supervised learning. However, we notice that, unlike general\nclassification tasks, in PbRL there exists a unique phenomenon that we define\nas the similarity trap in this paper. Intuitively, humans can have diametrically\nopposite preferences for similar segment pairs, and such similarity may cause\nconsistency regularization to fail in PbRL. Due to the existence of the similarity\ntrap, such consistency regularization improperly increases the consistency\nof the model's predictions between segment pairs, and thus reduces\nthe confidence in reward learning, since the augmented distribution does not\nmatch the original one in PbRL. To overcome this issue, we present a\nself-training method along with our proposed peer regularization, which\npenalizes the reward model for memorizing uninformative labels and encourages\nconfident predictions. Empirically, we demonstrate that our approach is capable\nof learning a variety of locomotion and robotic manipulation behaviors well,\nusing different semi-supervised alternatives and peer regularization.\n","authors":["Yachen Kang","Li He","Jinxin Liu","Zifeng Zhuang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09691v1","updated":"2023-07-19T00:27:49Z","published":"2023-07-19T00:27:49Z","title":"Joint Service Caching, Communication and Computing Resource Allocation\n in Collaborative MEC Systems: A DRL-based Two-timescale Approach","summary":" Meeting the strict Quality of Service (QoS) requirements of terminals has\nimposed a significant challenge on Multiaccess Edge Computing (MEC) systems,\ndue to the limited multidimensional resources. 
To address this challenge, we\npropose a collaborative MEC framework that facilitates resource sharing between\nthe edge servers, with the aim of maximizing the long-term QoS and reducing the\ncache switching cost through joint optimization of service caching,\ncollaborative offloading, and computation and communication resource\nallocation. The dual timescale feature and temporal recurrence relationship\nbetween service caching and other resource allocation make solving the problem\neven more challenging. To solve it, we propose a deep reinforcement learning\n(DRL)-based dual timescale scheme, called DGL-DDPG, which is composed of a\nshort-term genetic algorithm (GA) and a long short-term memory network-based\ndeep deterministic policy gradient (LSTM-DDPG). In doing so, we reformulate the\noptimization problem as a Markov decision process (MDP) where the\nsmall-timescale resource allocation decisions generated by an improved GA are\ntaken as the states and input into a centralized LSTM-DDPG agent to generate\nthe service caching decision for the large timescale. Simulation results\ndemonstrate that our proposed algorithm outperforms the baseline algorithms in\nterms of the average QoS and cache switching cost.\n","authors":["Qianqian Liu","Haixia Zhang","Xin Zhang","Dongfeng Yuan"],"pdf_url":"https://arxiv.org/pdf/2307.09691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09688v1","updated":"2023-07-19T00:08:49Z","published":"2023-07-19T00:08:49Z","title":"Amazon-M2: A Multilingual Multi-locale Shopping Session Dataset for\n Recommendation and Text Generation","summary":" Modeling customer shopping intentions is a crucial task for e-commerce, as it\ndirectly impacts user experience and engagement. Thus, accurately understanding\ncustomer preferences is essential for providing personalized recommendations.\nSession-based recommendation, which utilizes customer session data to predict\ntheir next interaction, has become increasingly popular. However, existing\nsession datasets have limitations in terms of item attributes, user diversity,\nand dataset scale. As a result, they cannot comprehensively capture the\nspectrum of user behaviors and preferences. To bridge this gap, we present the\nAmazon Multilingual Multi-locale Shopping Session Dataset, namely Amazon-M2. It\nis the first multilingual dataset consisting of millions of user sessions from\nsix different locales, where the major languages of products are English,\nGerman, Japanese, French, Italian, and Spanish. Remarkably, the dataset can\nhelp us enhance personalization and understanding of user preferences, which\ncan benefit various existing tasks as well as enable new tasks. To test the\npotential of the dataset, we introduce three tasks in this work: (1)\nnext-product recommendation, (2) next-product recommendation with domain\nshifts, and (3) next-product title generation. With the above tasks, we\nbenchmark a range of algorithms on our proposed dataset, drawing new insights\nfor further research and practice. In addition, based on the proposed dataset\nand tasks, we hosted a competition in the KDD CUP 2023 and have attracted\nthousands of users and submissions. 
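As an illustration of the next-product recommendation task described above, the following is a minimal first-order Markov baseline over made-up session data (the product IDs are hypothetical); the benchmark algorithms evaluated on Amazon-M2 are of course far more capable.

from collections import Counter, defaultdict

# Hypothetical toy sessions: each is an ordered list of product IDs.
sessions = [
    ["p1", "p2", "p3"],
    ["p1", "p2", "p4"],
    ["p2", "p3"],
]

# Count item-to-next-item transitions (a first-order Markov baseline).
transitions = defaultdict(Counter)
for s in sessions:
    for cur, nxt in zip(s, s[1:]):
        transitions[cur][nxt] += 1

def recommend(last_item, k=2):
    # Rank candidate next products by observed transition frequency.
    return [item for item, _ in transitions[last_item].most_common(k)]

print(recommend("p2"))  # e.g. ['p3', 'p4']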
The winning solutions and the associated\nworkshop can be accessed at our website https://kddcup23.github.io/.\n","authors":["Wei Jin","Haitao Mao","Zheng Li","Haoming Jiang","Chen Luo","Hongzhi Wen","Haoyu Han","Hanqing Lu","Zhengyang Wang","Ruirui Li","Zhen Li","Monica Xiao Cheng","Rahul Goutam","Haiyang Zhang","Karthik Subbian","Suhang Wang","Yizhou Sun","Jiliang Tang","Bing Yin","Xianfeng Tang"],"pdf_url":"https://arxiv.org/pdf/2307.09688v1.pdf","comment":"Dataset for KDD Cup 2023, https://kddcup23.github.io/"},{"id":"http://arxiv.org/abs/2210.01834v2","updated":"2023-07-19T23:50:47Z","published":"2022-10-04T18:06:29Z","title":"Invariant Aggregator for Defending against Federated Backdoor Attacks","summary":" Federated learning is gaining popularity as it enables training high-utility\nmodels across several clients without directly sharing their private data. As a\ndownside, the federated setting makes the model vulnerable to various\nadversarial attacks in the presence of malicious clients. Despite the\ntheoretical and empirical success in defending against attacks that aim to\ndegrade models' utility, defense against backdoor attacks that increase model\naccuracy on backdoor samples exclusively without hurting the utility on other\nsamples remains challenging. To this end, we first analyze the vulnerability of\nfederated learning to backdoor attacks over a flat loss landscape which is\ncommon for well-designed neural networks such as Resnet [He et al., 2015] but\nis often overlooked by previous works. Over a flat loss landscape, misleading\nfederated learning models to exclusively benefit malicious clients with\nbackdoor samples do not require a significant difference between malicious and\nbenign client-wise updates, making existing defenses insufficient. In contrast,\nwe propose an invariant aggregator that redirects the aggregated update to\ninvariant directions that are generally useful via selectively masking out the\ngradient elements that favor few and possibly malicious clients regardless of\nthe difference magnitude. Theoretical results suggest that our approach\nprovably mitigates backdoor attacks over both flat and sharp loss landscapes.\nEmpirical results on three datasets with different modalities and varying\nnumbers of clients further demonstrate that our approach mitigates a broad\nclass of backdoor attacks with a negligible cost on the model utility.\n","authors":["Xiaoyang Wang","Dimitrios Dimitriadis","Sanmi Koyejo","Shruti Tople"],"pdf_url":"https://arxiv.org/pdf/2210.01834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12877v2","updated":"2023-07-19T23:38:55Z","published":"2022-07-26T13:12:22Z","title":"Representing Random Utility Choice Models with Neural Networks","summary":" Motivated by the successes of deep learning, we propose a class of neural\nnetwork-based discrete choice models, called RUMnets, inspired by the random\nutility maximization (RUM) framework. This model formulates the agents' random\nutility function using a sample average approximation. We show that RUMnets\nsharply approximate the class of RUM discrete choice models: any model derived\nfrom random utility maximization has choice probabilities that can be\napproximated arbitrarily closely by a RUMnet. Reciprocally, any RUMnet is\nconsistent with the RUM principle. 
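The sample average approximation behind RUMnets can be illustrated with a minimal sketch: fix deterministic scores (which a RUMnet would instead compute with neural networks from product and customer features), add i.i.d. random utility shocks, and estimate choice probabilities as win frequencies across samples. The Gaussian shock and the scores below are assumptions for illustration, not the paper's exact specification.

import random

random.seed(0)

def choice_probabilities(scores, n_samples=5000):
    # Sample-average approximation of a random utility model:
    # utility_i = deterministic score_i + random shock; the choice probability
    # of alternative i is the fraction of samples in which it has maximal utility.
    wins = [0] * len(scores)
    for _ in range(n_samples):
        utilities = [s + random.gauss(0.0, 1.0) for s in scores]
        wins[utilities.index(max(utilities))] += 1
    return [w / n_samples for w in wins]

# Hypothetical deterministic scores for three alternatives.
print(choice_probabilities([1.0, 0.5, 0.0]))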
We derive an upper bound on the\ngeneralization error of RUMnets fitted on choice data, and gain theoretical\ninsights on their ability to predict choices on new, unseen data depending on\ncritical parameters of the dataset and architecture. By leveraging open-source\nlibraries for neural networks, we find that RUMnets are competitive against\nseveral choice modeling and machine learning methods in terms of predictive\naccuracy on two real-world datasets.\n","authors":["Ali Aouad","Antoine Désir"],"pdf_url":"https://arxiv.org/pdf/2207.12877v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2207.00419v3","updated":"2023-07-19T16:00:08Z","published":"2022-06-18T00:26:52Z","title":"Self-Supervised Learning for Videos: A Survey","summary":" The remarkable success of deep learning in various domains relies on the\navailability of large-scale annotated datasets. However, obtaining annotations\nis expensive and requires great effort, which is especially challenging for\nvideos. Moreover, the use of human-generated annotations leads to models with\nbiased learning and poor domain generalization and robustness. As an\nalternative, self-supervised learning provides a way for representation\nlearning which does not require annotations and has shown promise in both image\nand video domains. Different from the image domain, learning video\nrepresentations are more challenging due to the temporal dimension, bringing in\nmotion and other environmental dynamics. This also provides opportunities for\nvideo-exclusive ideas that advance self-supervised learning in the video and\nmultimodal domain. In this survey, we provide a review of existing approaches\non self-supervised learning focusing on the video domain. We summarize these\nmethods into four different categories based on their learning objectives: 1)\npretext tasks, 2) generative learning, 3) contrastive learning, and 4)\ncross-modal agreement. We further introduce the commonly used datasets,\ndownstream evaluation tasks, insights into the limitations of existing works,\nand the potential future directions in this area.\n","authors":["Madeline C. Schiappa","Yogesh S. Rawat","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2207.00419v3.pdf","comment":"ACM CSUR (December 2022). Project Link: https://bit.ly/3Oimc7Q"},{"id":"http://arxiv.org/abs/2307.10003v1","updated":"2023-07-19T14:23:26Z","published":"2023-07-19T14:23:26Z","title":"TbExplain: A Text-based Explanation Method for Scene Classification\n Models with the Statistical Prediction Correction","summary":" The field of Explainable Artificial Intelligence (XAI) aims to improve the\ninterpretability of black-box machine learning models. Building a heatmap based\non the importance value of input features is a popular method for explaining\nthe underlying functions of such models in producing their predictions.\nHeatmaps are almost understandable to humans, yet they are not without flaws.\nNon-expert users, for example, may not fully understand the logic of heatmaps\n(the logic in which relevant pixels to the model's prediction are highlighted\nwith different intensities or colors). Additionally, objects and regions of the\ninput image that are relevant to the model prediction are frequently not\nentirely differentiated by heatmaps. In this paper, we propose a framework\ncalled TbExplain that employs XAI techniques and a pre-trained object detector\nto present text-based explanations of scene classification models. 
Moreover,\nTbExplain incorporates a novel method to correct predictions and textually\nexplain them based on the statistics of objects in the input image when the\ninitial prediction is unreliable. To assess the trustworthiness and validity of\nthe text-based explanations, we conducted a qualitative experiment, and the\nfindings indicated that these explanations are sufficiently reliable.\nFurthermore, our quantitative and qualitative experiments on TbExplain with\nscene classification datasets reveal an improvement in classification accuracy\nover ResNet variants.\n","authors":["Amirhossein Aminimehr","Pouya Khani","Amirali Molaei","Amirmohammad Kazemeini","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2307.10003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09936v1","updated":"2023-07-19T12:21:39Z","published":"2023-07-19T12:21:39Z","title":"AGAR: Attention Graph-RNN for Adaptative Motion Prediction of Point\n Clouds of Deformable Objects","summary":" This paper focuses on motion prediction for point cloud sequences in the\nchallenging case of deformable 3D objects, such as human body motion. First, we\ninvestigate the challenges caused by deformable shapes and complex motions\npresent in this type of representation, with the ultimate goal of understanding\nthe technical limitations of state-of-the-art models. From this understanding,\nwe propose an improved architecture for point cloud prediction of deformable 3D\nobjects. Specifically, to handle deformable shapes, we propose a graph-based\napproach that learns and exploits the spatial structure of point clouds to\nextract more representative features. Then we propose a module able to combine\nthe learned features in an adaptative manner according to the point cloud\nmovements. The proposed adaptative module controls the composition of local and\nglobal motions for each point, enabling the network to model complex motions in\ndeformable 3D objects more effectively. We tested the proposed method on the\nfollowing datasets: MNIST moving digits, the Mixamo human bodies motions, JPEG\nand CWIPC-SXR real-world dynamic bodies. Simulation results demonstrate that\nour method outperforms the current baseline methods given its improved ability\nto model complex movements as well as preserve point cloud shape. Furthermore,\nwe demonstrate the generalizability of the proposed framework for dynamic\nfeature learning, by testing the framework for action recognition on the\nMSRAction3D dataset and achieving results on-par with state-of-the-art methods\n","authors":["Pedro Gomes","Silvia Rossi","Laura Toni"],"pdf_url":"https://arxiv.org/pdf/2307.09936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09915v1","updated":"2023-07-19T11:35:21Z","published":"2023-07-19T11:35:21Z","title":"Embedded Heterogeneous Attention Transformer for Cross-lingual Image\n Captioning","summary":" Cross-lingual image captioning is confronted with both cross-lingual and\ncross-modal challenges for multimedia analysis. The crucial issue in this task\nis to model the global and local matching between the image and different\nlanguages. Existing cross-modal embedding methods based on Transformer\narchitecture oversight the local matching between the image region and\nmonolingual words, not to mention in the face of a variety of differentiated\nlanguages. 
Due to the heterogeneous property of the cross-modal and\ncross-lingual task, we utilize a heterogeneous network to establish\ncross-domain relationships and the local correspondences between the image and\ndifferent languages. In this paper, we propose an Embedded Heterogeneous\nAttention Transformer (EHAT) to build cross-domain reasoning paths for\ncross-lingual image captioning and integrate them into the Transformer. The proposed\nEHAT consists of a Masked Heterogeneous Cross-attention (MHCA), a Heterogeneous\nAttention Reasoning Network (HARN) and a Heterogeneous Co-attention (HCA). HARN,\nas the core network, models and infers cross-domain relationships anchored by\nvisual bounding box representation features to connect the word features of the two\nlanguages and learn the heterogeneous maps. MHCA and HCA implement cross-domain\nintegration in the encoder through the special heterogeneous attention and\nenable a single model to generate captions in both languages. We test on the MSCOCO\ndataset, generating English and Chinese captions; these two widely used languages show\nan obvious difference between their language families. Our experiments show that\nour method even achieves better results than advanced monolingual methods.\n","authors":["Zijie Song","Zhenzhen Hu","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2307.09915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09821v1","updated":"2023-07-19T08:16:34Z","published":"2023-07-19T08:16:34Z","title":"Hierarchical Semantic Perceptual Listener Head Video Generation: A\n High-performance Pipeline","summary":" In dyadic speaker-listener interactions, the listener's head reactions, along\nwith the speaker's head movements, constitute an important non-verbal semantic\nexpression. The listener head generation task aims to synthesize\nresponsive listener head videos based on the audio of the speaker and reference\nimages of the listener. Compared to talking-head generation, it is more\nchallenging to capture the correlation clues from the speaker's audio and\nvisual information. Following the ViCo baseline scheme, we propose a\nhigh-performance solution by enhancing the hierarchical semantic extraction\ncapability of the audio encoder module and improving the decoder part, renderer\nand post-processing modules. Our solution achieved first place on the official\nleaderboard for the listening head generation track. This paper is a\ntechnical report for the ViCo@2023 Conversational Head Generation Challenge at the ACM\nMultimedia 2023 conference.\n","authors":["Zhigang Chang","Weitai Hu","Qing Yang","Shibao Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.09821v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.07848v5","updated":"2023-07-19T04:56:33Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Speech Emotion Recognition","summary":" Contrastive learning-based cross-modality pretraining methods have recently\nexhibited impressive success in diverse fields. In this paper, we propose\nGEmo-CLAP, a kind of gender-attribute-enhanced contrastive language-audio\npretraining (CLAP) method for speech emotion recognition. Specifically, a novel\nemotion CLAP model (Emo-CLAP) is first built, utilizing various self-supervised\npre-trained models. 
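For orientation, a minimal sketch of the symmetric contrastive (InfoNCE-style) objective used by CLAP-style audio-text pretraining follows, with made-up 2-dimensional embeddings; the gender-attribute extensions of GEmo-CLAP (soft labels or a multi-task head) are not shown, so this is only the generic contrastive term.

import math

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    nu = math.sqrt(sum(a * a for a in u))
    nv = math.sqrt(sum(b * b for b in v))
    return dot / (nu * nv)

def clap_style_loss(audio_emb, text_emb, temperature=0.07):
    # Symmetric InfoNCE: matched audio/text pairs (same index) are positives,
    # all other pairings in the batch serve as negatives.
    n = len(audio_emb)
    sims = [[cosine(a, t) / temperature for t in text_emb] for a in audio_emb]
    loss = 0.0
    for i in range(n):
        row = sims[i]                          # audio i scored against all texts
        col = [sims[j][i] for j in range(n)]   # text i scored against all audios
        loss += -math.log(math.exp(row[i]) / sum(math.exp(x) for x in row))
        loss += -math.log(math.exp(col[i]) / sum(math.exp(x) for x in col))
    return loss / (2 * n)

# Hypothetical embeddings for a batch of two utterances and two emotion prompts.
audio = [[0.9, 0.1], [0.2, 0.8]]
text = [[1.0, 0.0], [0.0, 1.0]]
print(round(clap_style_loss(audio, text), 4))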
Second, considering the importance of gender attribute in\nspeech emotion modeling, the soft label based GEmo-CLAP (SL-GEmo-CLAP) and\nmulti-task learning based GEmo-CLAP (ML-GEmo-CLAP) are further proposed to\nintegrate the emotion and gender information of speech signals, forming more\nreasonable objectives. Extensive experiments on IEMOCAP show that our proposed\ntwo GEmo-CLAP models consistently outperform the baseline Emo-CLAP with\ndifferent pre-trained models, while also achieving the best recognition\nperformance compared with recent state-of-the-art methods. Noticeably, the\nproposed WavLM-based ML-GEmo-CLAP obtains the best UAR of 80.16\\% and WAR of\n82.06\\%.\n","authors":["Yu Pan","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2306.07848v5.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2307.09729v1","updated":"2023-07-19T02:33:42Z","published":"2023-07-19T02:33:42Z","title":"NTIRE 2023 Quality Assessment of Video Enhancement Challenge","summary":" This paper reports on the NTIRE 2023 Quality Assessment of Video Enhancement\nChallenge, which will be held in conjunction with the New Trends in Image\nRestoration and Enhancement Workshop (NTIRE) at CVPR 2023. This challenge is to\naddress a major challenge in the field of video processing, namely, video\nquality assessment (VQA) for enhanced videos. The challenge uses the VQA\nDataset for Perceptual Video Enhancement (VDPVE), which has a total of 1211\nenhanced videos, including 600 videos with color, brightness, and contrast\nenhancements, 310 videos with deblurring, and 301 deshaked videos. The\nchallenge has a total of 167 registered participants. 61 participating teams\nsubmitted their prediction results during the development phase, with a total\nof 3168 submissions. A total of 176 submissions were submitted by 37\nparticipating teams during the final testing phase. Finally, 19 participating\nteams submitted their models and fact sheets, and detailed the methods they\nused. 
Some methods have achieved better results than baseline methods, and the\nwinning methods have demonstrated superior prediction performance.\n","authors":["Xiaohong Liu","Xiongkuo Min","Wei Sun","Yulun Zhang","Kai Zhang","Radu Timofte","Guangtao Zhai","Yixuan Gao","Yuqin Cao","Tengchuan Kou","Yunlong Dong","Ziheng Jia","Yilin Li","Wei Wu","Shuming Hu","Sibin Deng","Pengxiang Xiao","Ying Chen","Kai Li","Kai Zhao","Kun Yuan","Ming Sun","Heng Cong","Hao Wang","Lingzhi Fu","Yusheng Zhang","Rongyu Zhang","Hang Shi","Qihang Xu","Longan Xiao","Zhiliang Ma","Mirko Agarla","Luigi Celona","Claudio Rota","Raimondo Schettini","Zhiwei Huang","Yanan Li","Xiaotao Wang","Lei Lei","Hongye Liu","Wei Hong","Ironhead Chuang","Allen Lin","Drake Guan","Iris Chen","Kae Lou","Willy Huang","Yachun Tasi","Yvonne Kao","Haotian Fan","Fangyuan Kong","Shiqi Zhou","Hao Liu","Yu Lai","Shanshan Chen","Wenqi Wang","Haoning Wu","Chaofeng Chen","Chunzheng Zhu","Zekun Guo","Shiling Zhao","Haibing Yin","Hongkui Wang","Hanene Brachemi Meftah","Sid Ahmed Fezza","Wassim Hamidouche","Olivier Déforges","Tengfei Shi","Azadeh Mansouri","Hossein Motamednia","Amir Hossein Bakhtiari","Ahmad Mahmoudi Aznaveh"],"pdf_url":"https://arxiv.org/pdf/2307.09729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10346v1","updated":"2023-07-19T17:10:56Z","published":"2023-07-19T17:10:56Z","title":"Estudio de la Experiencia de Usuario mediante un Sistema de Dashboards\n de Análisis de Aprendizaje Multimodal","summary":" In the article, we present a Web-based System called M2LADS, which supports\nthe integration and visualization of multimodal data recorded in user\nexperiences (UX) in a Learning Analytics (LA) system in the form of Web-based\nDashboards. Based on the edBB platform, the multimodal data gathered contains\nbiometric and behavioral signals including electroencephalogram data to measure\nlearners' cognitive attention, heart rate for affective measures and visual\nattention from the video recordings. Additionally, learners' static background\ndata and their learning performance measures are tracked using LOGGE tool.\nM2LADS provides opportunities to capture learners' holistic experience during\ntheir interactions with the learning analytic system in order to improve the\nsystem and the user experience of the learners.\n --\n En este art\\'iculo, presentamos M2LADS, un sistema que permite la\nintegraci\\'on y visualizaci\\'on de datos multimodales en forma de Dashboards\nWeb. Estos datos provienen de sesiones de experiencia de usuario en un sistema\nde Learning Analytics (LA) llevadas a cabo por estudiantes de MOOCs. Los datos\nmultimodales incluyen se\\~nales biom\\'etricas y de comportamiento monitorizados\npor la plataforma edBB, como electroencefalogramas (EEG) de 5 canales,\nfrecuencia card\\'iaca, atenci\\'on visual, videos en el espectro visible y NIR,\nentre otros. Adem\\'as, se incluyen datos de interacci\\'on de los estudiantes\ncon el sistema de LA a trav\\'es de la herramienta LOGGE. Toda esta\ninformaci\\'on proporciona una comprensi\\'on completa de la experiencia del\nusuario al utilizar el sistema de LA, lo que ha permitido tanto mejorar el\nsistema LA como la experiencia de aprendizaje de los estudiantes de MOOCs.\n","authors":["Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2307.10346v1.pdf","comment":"Accepted in \"XXIII CONGRESO INTERNACIONAL DE INTERACCI\\'ON\n PERSONA-ORDENADOR 2023\". Article in Spanish language. 
The abstract in English\n and Spanish. There is an extended abstract of 2 pages in English"}]},"2023-07-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.11088v1","updated":"2023-07-20T17:59:41Z","published":"2023-07-20T17:59:41Z","title":"L-Eval: Instituting Standardized Evaluation for Long Context Language\n Models","summary":" Recently, there has been growing interest in extending the context length of\ninstruction-following models in order to effectively process single-turn long\ninput (e.g. summarizing a paper) and conversations with more extensive\nhistories. While proprietary models such as GPT-4 and Claude have demonstrated\nconsiderable advancements in handling tens of thousands of tokens of context,\nopen-sourced models are still in the early stages of experimentation. It also\nremains unclear whether developing these long context models can offer\nsubstantial gains on practical downstream tasks over retrieval-based methods or\nmodels simply trained on chunked contexts. To address this challenge, we\npropose to institute standardized evaluation for long context language models.\nConcretely, we develop L-Eval which contains 411 long documents and over 2,000\nquery-response pairs manually annotated and checked by the authors encompassing\nareas such as law, finance, school lectures, lengthy conversations, news,\nlong-form novels, and meetings. L-Eval also adopts diverse evaluation methods\nand instruction styles, enabling a more reliable assessment of Long Context\nLanguage Models (LCLMs). Our findings indicate that while open-source models\ntypically lag behind their commercial counterparts, they still exhibit\nimpressive performance. LLaMA2 achieves the best results (win 45\\% vs\nturbo-16k) on open-ended tasks with only 4k context length and ChatGLM2\nachieves the best results on closed-ended tasks with 8k input tokens. We\nrelease our new evaluation suite, code, and all generation results including\npredictions from all open-sourced LCLMs, GPT4-32k, Cluade-100k at\n{\\url{https://github.com/OpenLMLab/LEval}}.\n","authors":["Chenxin An","Shansan Gong","Ming Zhong","Mukai Li","Jun Zhang","Lingpeng Kong","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2307.11088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10172v2","updated":"2023-07-20T17:59:35Z","published":"2023-07-19T17:57:53Z","title":"DialogStudio: Towards Richest and Most Diverse Unified Dataset\n Collection for Conversational AI","summary":" Despite advancements in conversational AI, language models encounter\nchallenges to handle diverse conversational tasks, and existing dialogue\ndataset collections often lack diversity and comprehensiveness. To tackle these\nissues, we introduce DialogStudio: the largest and most diverse collection of\ndialogue datasets, unified under a consistent format while preserving their\noriginal information. Our collection encompasses data from open-domain\ndialogues, task-oriented dialogues, natural language understanding,\nconversational recommendation, dialogue summarization, and knowledge-grounded\ndialogues, making it an incredibly rich and diverse resource for dialogue\nresearch and model training. To further enhance the utility of DialogStudio, we\nidentify the licenses for each dataset and design domain-aware prompts for\nselected dialogues to facilitate instruction-aware fine-tuning. 
Furthermore, we\ndevelop conversational AI models using the dataset collection, and our\nexperiments in both zero-shot and few-shot learning scenarios demonstrate the\nsuperiority of DialogStudio. To improve transparency and support dataset and\ntask-based research, as well as language model pre-training, all datasets,\nlicenses, codes, and models associated with DialogStudio are made publicly\naccessible at https://github.com/salesforce/DialogStudio\n","authors":["Jianguo Zhang","Kun Qian","Zhiwei Liu","Shelby Heinecke","Rui Meng","Ye Liu","Zhou Yu","Huan Wang","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2307.10172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13867v2","updated":"2023-07-20T17:59:14Z","published":"2023-01-31T18:59:03Z","title":"Mathematical Capabilities of ChatGPT","summary":" We investigate the mathematical capabilities of two iterations of ChatGPT\n(released 9-January-2023 and 30-January-2023) and of GPT-4 by testing them on\npublicly available datasets, as well as hand-crafted ones, using a novel\nmethodology. In contrast to formal mathematics, where large databases of formal\nproofs are available (e.g., the Lean Mathematical Library), current datasets of\nnatural-language mathematics, used to benchmark language models, either cover\nonly elementary mathematics or are very small. We address this by publicly\nreleasing two new datasets: GHOSTS and miniGHOSTS. These are the first\nnatural-language datasets curated by working researchers in mathematics that\n(1) aim to cover graduate-level mathematics, (2) provide a holistic overview of\nthe mathematical capabilities of language models, and (3) distinguish multiple\ndimensions of mathematical reasoning. These datasets also test whether ChatGPT\nand GPT-4 can be helpful assistants to professional mathematicians by emulating\nuse cases that arise in the daily professional activities of mathematicians. We\nbenchmark the models on a range of fine-grained performance metrics. For\nadvanced mathematics, this is the most detailed evaluation effort to date. We\nfind that ChatGPT can be used most successfully as a mathematical assistant for\nquerying facts, acting as a mathematical search engine and knowledge base\ninterface. GPT-4 can additionally be used for undergraduate-level mathematics\nbut fails on graduate-level difficulty. Contrary to many positive reports in\nthe media about GPT-4 and ChatGPT's exam-solving abilities (a potential case of\nselection bias), their overall mathematical performance is well below the level\nof a graduate student. Hence, if your goal is to use ChatGPT to pass a\ngraduate-level math exam, you would be better off copying from your average\npeer!\n","authors":["Simon Frieder","Luca Pinchetti","Alexis Chevalier","Ryan-Rhys Griffiths","Tommaso Salvatori","Thomas Lukasiewicz","Philipp Christian Petersen","Julius Berner"],"pdf_url":"https://arxiv.org/pdf/2301.13867v2.pdf","comment":"Added further evaluations on another ChatGPT version and on GPT-4.\n The GHOSTS and miniGHOSTS datasets are available at\n https://github.com/xyfrieder/science-GHOSTS"},{"id":"http://arxiv.org/abs/2304.07880v3","updated":"2023-07-20T17:34:39Z","published":"2023-04-16T20:11:19Z","title":"Sabiá: Portuguese Large Language Models","summary":" As the capabilities of language models continue to advance, it is conceivable\nthat \"one-size-fits-all\" model will remain as the main paradigm. 
For instance,\ngiven the vast number of languages worldwide, many of which are low-resource,\nthe prevalent practice is to pretrain a single model on multiple languages. In\nthis paper, we add to the growing body of evidence that challenges this\npractice, demonstrating that monolingual pretraining on the target language\nsignificantly improves models already extensively trained on diverse corpora.\nMore specifically, we further pretrain GPT-J and LLaMA models on Portuguese\ntexts using 3% or less of their original pretraining budget. Few-shot\nevaluations on Poeta, a suite of 14 Portuguese datasets, reveal that our models\noutperform English-centric and multilingual counterparts by a significant\nmargin. Our best model, Sabi\\'a-65B, performs on par with GPT-3.5-turbo. By\nevaluating on datasets originally conceived in the target language as well as\ntranslated ones, we study the contributions of language-specific pretraining in\nterms of 1) capturing linguistic nuances and structures inherent to the target\nlanguage, and 2) enriching the model's knowledge about a domain or culture. Our\nresults indicate that the majority of the benefits stem from the\ndomain-specific knowledge acquired through monolingual pretraining.\n","authors":["Ramon Pires","Hugo Abonizio","Thales Sales Almeida","Rodrigo Nogueira"],"pdf_url":"https://arxiv.org/pdf/2304.07880v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11031v1","updated":"2023-07-20T17:07:28Z","published":"2023-07-20T17:07:28Z","title":"Embroid: Unsupervised Prediction Smoothing Can Improve Few-Shot\n Classification","summary":" Recent work has shown that language models' (LMs) prompt-based learning\ncapabilities make them well suited for automating data labeling in domains\nwhere manual annotation is expensive. The challenge is that while writing an\ninitial prompt is cheap, improving a prompt is costly -- practitioners often\nrequire significant labeled data in order to evaluate the impact of prompt\nmodifications. Our work asks whether it is possible to improve prompt-based\nlearning without additional labeled data. We approach this problem by\nattempting to modify the predictions of a prompt, rather than the prompt\nitself. Our intuition is that accurate predictions should also be consistent:\nsamples which are similar under some feature representation should receive the\nsame prompt prediction. We propose Embroid, a method which computes multiple\nrepresentations of a dataset under different embedding functions, and uses the\nconsistency between the LM predictions for neighboring samples to identify\nmispredictions. Embroid then uses these neighborhoods to create additional\npredictions for each sample, and combines these predictions with a simple\nlatent variable graphical model in order to generate a final corrected\nprediction. In addition to providing a theoretical analysis of Embroid, we\nconduct a rigorous empirical evaluation across six different LMs and up to 95\ndifferent tasks. We find that (1) Embroid substantially improves performance\nover original prompts (e.g., by an average of 7.3 points on GPT-JT), (2) also\nrealizes improvements for more sophisticated prompting strategies (e.g.,\nchain-of-thought), and (3) can be specialized to domains like law through the\nembedding functions.\n","authors":["Neel Guha","Mayee F. 
Chen","Kush Bhatia","Azalia Mirhoseini","Frederic Sala","Christopher Ré"],"pdf_url":"https://arxiv.org/pdf/2307.11031v1.pdf","comment":"38 pages, 22 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.10811v1","updated":"2023-07-20T16:55:25Z","published":"2023-07-20T16:55:25Z","title":"\"It Felt Like Having a Second Mind\": Investigating Human-AI\n Co-creativity in Prewriting with Large Language Models","summary":" Prewriting is the process of discovering and developing ideas before a first\ndraft, which requires divergent thinking and often implies unstructured\nstrategies such as diagramming, outlining, free-writing, etc. Although large\nlanguage models (LLMs) have been demonstrated to be useful for a variety of\ntasks including creative writing, little is known about how users would\ncollaborate with LLMs to support prewriting. The preferred collaborative role\nand initiative of LLMs during such a creativity process is also unclear. To\ninvestigate human-LLM collaboration patterns and dynamics during prewriting, we\nconducted a three-session qualitative study with 15 participants in two\ncreative tasks: story writing and slogan writing. The findings indicated that\nduring collaborative prewriting, there appears to be a three-stage iterative\nHuman-AI Co-creativity process that includes Ideation, Illumination, and\nImplementation stages. This collaborative process champions the human in a\ndominant role, in addition to mixed and shifting levels of initiative that\nexist between humans and LLMs. This research also reports on collaboration\nbreakdowns that occur during this process, user perceptions of using existing\nLLMs during Human-AI Co-creativity, and discusses design implications to\nsupport this co-creativity process.\n","authors":["Qian Wan","Siying Hu","Yu Zhang","Piaohong Wang","Bo Wen","Zhicong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10811v1.pdf","comment":"Under review at CSCW after a Major Revision"},{"id":"http://arxiv.org/abs/2307.11019v1","updated":"2023-07-20T16:46:10Z","published":"2023-07-20T16:46:10Z","title":"Investigating the Factual Knowledge Boundary of Large Language Models\n with Retrieval Augmentation","summary":" Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require\na substantial amount of factual knowledge and often rely on external\ninformation for assistance. Recently, large language models (LLMs) (e.g.,\nChatGPT), have demonstrated impressive prowess in solving a wide range of tasks\nwith world knowledge, including knowledge-intensive tasks. However, it remains\nunclear how well LLMs are able to perceive their factual knowledge boundaries,\nparticularly how they behave when incorporating retrieval augmentation. In this\nstudy, we present an initial analysis of the factual knowledge boundaries of\nLLMs and how retrieval augmentation affects LLMs on open-domain QA. Specially,\nwe focus on three primary research questions and analyze them by examining QA\nperformance, priori judgement and posteriori judgement of LLMs. We show\nevidence that LLMs possess unwavering confidence in their capabilities to\nrespond to questions and the accuracy of their responses. Furthermore,\nretrieval augmentation proves to be an effective approach in enhancing LLMs'\nawareness of knowledge boundaries, thereby improving their judgemental\nabilities. Additionally, we also find that LLMs have a propensity to rely on\nthe provided retrieval results when formulating answers, while the quality of\nthese results significantly impacts their reliance. 
The code to reproduce this\nwork is available at https://github.com/RUCAIBox/LLM-Knowledge-Boundary.\n","authors":["Ruiyang Ren","Yuhao Wang","Yingqi Qu","Wayne Xin Zhao","Jing Liu","Hao Tian","Hua Wu","Ji-Rong Wen","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11005v1","updated":"2023-07-20T16:34:40Z","published":"2023-07-20T16:34:40Z","title":"Integrating Pretrained ASR and LM to Perform Sequence Generation for\n Spoken Language Understanding","summary":" There has been an increased interest in the integration of pretrained speech\nrecognition (ASR) and language models (LM) into the SLU framework. However,\nprior methods often struggle with a vocabulary mismatch between pretrained\nmodels, and LM cannot be directly utilized as they diverge from its NLU\nformulation. In this study, we propose a three-pass end-to-end (E2E) SLU system\nthat effectively integrates ASR and LM subnetworks into the SLU formulation for\nsequence generation tasks. In the first pass, our architecture predicts ASR\ntranscripts using the ASR subnetwork. This is followed by the LM subnetwork,\nwhich makes an initial SLU prediction. Finally, in the third pass, the\ndeliberation subnetwork conditions on representations from the ASR and LM\nsubnetworks to make the final prediction. Our proposed three-pass SLU system\nshows improved performance over cascaded and E2E SLU models on two benchmark\nSLU datasets, SLURP and SLUE, especially on acoustically challenging\nutterances.\n","authors":["Siddhant Arora","Hayato Futami","Yosuke Kashiwagi","Emiru Tsunoo","Brian Yan","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2307.11005v1.pdf","comment":"Accepted at INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2210.05335v3","updated":"2023-07-20T16:24:14Z","published":"2022-10-11T10:54:54Z","title":"MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model","summary":" Multimodal semantic understanding often has to deal with uncertainty, which\nmeans the obtained messages tend to refer to multiple targets. Such uncertainty\nis problematic for our interpretation, including inter- and intra-modal\nuncertainty. Little effort has studied the modeling of this uncertainty,\nparticularly in pre-training on unlabeled datasets and fine-tuning in\ntask-specific downstream datasets. In this paper, we project the\nrepresentations of all modalities as probabilistic distributions via a\nProbability Distribution Encoder (PDE) by utilizing sequence-level\ninteractions. Compared to the existing deterministic methods, such uncertainty\nmodeling can convey richer multimodal semantic information and more complex\nrelationships. Furthermore, we integrate uncertainty modeling with popular\npre-training frameworks and propose suitable pre-training tasks:\nDistribution-based Vision-Language Contrastive learning (D-VLC),\nDistribution-based Masked Language Modeling (D-MLM), and Distribution-based\nImage-Text Matching (D-ITM). 
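As a toy illustration of working with distribution-valued embeddings like those produced by a probability distribution encoder, the sketch below scores two diagonal-Gaussian representations with the closed-form 2-Wasserstein distance; the choice of distance, the dimensionality, and all numbers are assumptions for illustration and not necessarily what MAP uses.

import math

def w2_diag_gauss(mu1, var1, mu2, var2):
    # Squared 2-Wasserstein distance between two diagonal Gaussians:
    # ||mu1 - mu2||^2 + sum_d (sqrt(var1_d) - sqrt(var2_d))^2
    mean_term = sum((a - b) ** 2 for a, b in zip(mu1, mu2))
    cov_term = sum((math.sqrt(a) - math.sqrt(b)) ** 2 for a, b in zip(var1, var2))
    return mean_term + cov_term

# Hypothetical image and text representations, each a Gaussian with a mean and
# a per-dimension variance (a larger variance marks a more uncertain region/token).
image_mu, image_var = [0.2, 0.7], [0.10, 0.05]
text_mu, text_var = [0.1, 0.9], [0.20, 0.05]
print(round(w2_diag_gauss(image_mu, image_var, text_mu, text_var), 4))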
The fine-tuned models are applied to challenging\ndownstream tasks, including image-text retrieval, visual question answering,\nvisual reasoning, and visual entailment, and achieve state-of-the-art results.\n","authors":["Yatai Ji","Junjie Wang","Yuan Gong","Lin Zhang","Yanru Zhu","Hongfa Wang","Jiaxing Zhang","Tetsuya Sakai","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05335v3.pdf","comment":"CVPR 2023 Main Track Long Paper"},{"id":"http://arxiv.org/abs/2307.10982v1","updated":"2023-07-20T16:09:57Z","published":"2023-07-20T16:09:57Z","title":"MASR: Metadata Aware Speech Representation","summary":" In the recent years, speech representation learning is constructed primarily\nas a self-supervised learning (SSL) task, using the raw audio signal alone,\nwhile ignoring the side-information that is often available for a given speech\nrecording. In this paper, we propose MASR, a Metadata Aware Speech\nRepresentation learning framework, which addresses the aforementioned\nlimitations. MASR enables the inclusion of multiple external knowledge sources\nto enhance the utilization of meta-data information. The external knowledge\nsources are incorporated in the form of sample-level pair-wise similarity\nmatrices that are useful in a hard-mining loss. A key advantage of the MASR\nframework is that it can be combined with any choice of SSL method. Using MASR\nrepresentations, we perform evaluations on several downstream tasks such as\nlanguage identification, speech recognition and other non-semantic tasks such\nas speaker and emotion recognition. In these experiments, we illustrate\nsignificant performance improvements for the MASR over other established\nbenchmarks. We perform a detailed analysis on the language identification task\nto provide insights on how the proposed loss function enables the\nrepresentations to separate closely related languages.\n","authors":["Anjali Raj","Shikhar Bharadwaj","Sriram Ganapathy","Min Ma","Shikhar Vashishth"],"pdf_url":"https://arxiv.org/pdf/2307.10982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12619v2","updated":"2023-07-20T16:04:19Z","published":"2023-06-22T01:14:47Z","title":"Class-Incremental Learning based on Label Generation","summary":" Despite the great success of pre-trained language models, it is still a\nchallenge to use these models for continual learning, especially for the\nclass-incremental learning (CIL) setting due to catastrophic forgetting (CF).\nThis paper reports our finding that if we formulate CIL as a continual label\ngeneration problem, CF is drastically reduced and the generalizable\nrepresentations of pre-trained models can be better retained. We thus propose a\nnew CIL method (VAG) that also leverages the sparsity of vocabulary to focus\nthe generation and creates pseudo-replay samples by using label semantics.\nExperimental results show that VAG outperforms baselines by a large margin.\n","authors":["Yijia Shao","Yiduo Guo","Dongyan Zhao","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.12619v2.pdf","comment":"12 pages, ACL 2023 Main Conference"},{"id":"http://arxiv.org/abs/2306.14192v2","updated":"2023-07-20T15:20:51Z","published":"2023-06-25T10:16:49Z","title":"$α$-$β$-Factorization and the Binary Case of Simon's Congruence","summary":" In 1991 H\\'ebrard introduced a factorization of words that turned out to be a\npowerful tool for the investigation of a word's scattered factors (also known\nas (scattered) subwords or subsequences). 
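The notions used in this line of work are easy to state in code. Below is a minimal sketch of a scattered-factor (subsequence) test and of Hébrard's arch factorization, whose number of complete arches gives the universality index of a word; the α-β-factorization studied in the paper builds on these ideas but is not reproduced here.

def is_scattered_factor(u, w):
    # u is a scattered factor (subsequence) of w if its letters occur in w in order.
    it = iter(w)
    return all(ch in it for ch in u)

def arch_factorization(w, alphabet):
    # Hebrard's arch factorization: repeatedly cut off the shortest prefix that
    # contains every letter of the alphabet; the number of complete arches is
    # the universality index of w, and the leftover suffix is the rest.
    arches, seen, start = [], set(), 0
    for i, ch in enumerate(w):
        seen.add(ch)
        if seen == set(alphabet):
            arches.append(w[start:i + 1])
            seen, start = set(), i + 1
    rest = w[start:]
    return arches, rest

print(is_scattered_factor("ac", "abc"))        # True
print(arch_factorization("abcacb", "abc"))     # (['abc', 'acb'], '') -> 2-universal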
Based on this, first Karandikar and\nSchnoebelen introduced the notion of $k$-richness and later on Barker et al.\nthe notion of $k$-universality. In 2022 Fleischmann et al. presented a\ngeneralization of the arch factorization by intersecting the arch factorization\nof a word and its reverse. While the authors merely used this factorization for\nthe investigation of shortest absent scattered factors, in this work we\ninvestigate this new $\\alpha$-$\\beta$-factorization as such. We characterize\nthe famous Simon congruence of $k$-universal words in terms of $1$-universal\nwords. Moreover, we apply these results to binary words. In this special case,\nwe obtain a full characterization of the classes and calculate the index of the\ncongruence. Lastly, we start investigating the ternary case, present a full\nlist of possibilities for $\\alpha\\beta\\alpha$-factors, and characterize their\ncongruence.\n","authors":["Pamela Fleischmann","Jonas Höfer","Annika Huch","Dirk Nowotka"],"pdf_url":"https://arxiv.org/pdf/2306.14192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10932v1","updated":"2023-07-20T15:02:42Z","published":"2023-07-20T15:02:42Z","title":"Identical and Fraternal Twins: Fine-Grained Semantic Contrastive\n Learning of Sentence Representations","summary":" The enhancement of unsupervised learning of sentence representations has been\nsignificantly achieved by the utility of contrastive learning. This approach\nclusters the augmented positive instance with the anchor instance to create a\ndesired embedding space. However, relying solely on the contrastive objective\ncan result in sub-optimal outcomes due to its inability to differentiate subtle\nsemantic variations between positive pairs. Specifically, common data\naugmentation techniques frequently introduce semantic distortion, leading to a\nsemantic margin between the positive pair. While the InfoNCE loss function\noverlooks the semantic margin and prioritizes similarity maximization between\npositive pairs during training, leading to the insensitive semantic\ncomprehension ability of the trained model. In this paper, we introduce a novel\nIdentical and Fraternal Twins of Contrastive Learning (named IFTCL) framework,\ncapable of simultaneously adapting to various positive pairs generated by\ndifferent augmentation techniques. We propose a \\textit{Twins Loss} to preserve\nthe innate margin during training and promote the potential of data enhancement\nin order to overcome the sub-optimal issue. We also present proof-of-concept\nexperiments combined with the contrastive objective to prove the validity of\nthe proposed Twins Loss. Furthermore, we propose a hippocampus queue mechanism\nto restore and reuse the negative instances without additional calculation,\nwhich further enhances the efficiency and performance of the IFCL. We verify\nthe IFCL framework on nine semantic textual similarity tasks with both English\nand Chinese datasets, and the experimental results show that IFCL outperforms\nstate-of-the-art methods.\n","authors":["Qingfa Xiao","Shuangyin Li","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2307.10932v1.pdf","comment":"This article has been accepted for publication in European Conference\n on Artificial Intelligence (ECAI2023). 
9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.10930v1","updated":"2023-07-20T14:59:02Z","published":"2023-07-20T14:59:02Z","title":"MediaGPT : A Large Language Model Target Chinese Media","summary":" The development of large language models (LLMs) has seen rapid progress in\nrecent years. One of the most widely used LLMs is the Generative Pre-trained\nTransformer (GPT) series, which has been applied in various fields, including\nthe media domain. However, in practical applications, the differences between\nthe media's use cases and the general-purpose applications of LLMs have become\nincreasingly apparent, especially Chinese. As a result, there is a growing need\nto develop LLM that are specifically tailored to the unique requirements of the\nmedia domain. In this paper, we present MediaGPT, a large language model\ntraining on variety of media data and addressing the practical needs of Chinese\nmedia. We have designed a diverse set of task instruction types to cater to the\nspecific requirements of the domain. To further validate the effectiveness of\nour proposed LLM, we have constructed unique datasets that are tailored to the\nmedia domain and have also developed verification methods that are specifically\ndesigned for generative-type tasks. By doing so, we aim to bridge the gap\nbetween the general-purpose LLM and the requirements of the media domain, and\nto pave the way for more effective and efficient use of LLM in this field. This\npaper aims to explore the challenges and opportunities of developing LLM for\nmedia applications and to propose potential solutions for addressing these\nchallenges.\n","authors":["Zhonghao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10928v1","updated":"2023-07-20T14:56:35Z","published":"2023-07-20T14:56:35Z","title":"FLASK: Fine-grained Language Model Evaluation based on Alignment Skill\n Sets","summary":" Evaluation of Large Language Models (LLMs) is challenging because aligning to\nhuman values requires the composition of multiple skills and the required set\nof skills varies depending on the instruction. Recent studies have evaluated\nthe performance of LLMs in two ways, (1) automatic evaluation on several\nindependent benchmarks and (2) human or machined-based evaluation giving an\noverall score to the response. However, both settings are coarse-grained\nevaluations, not considering the nature of user instructions that require\ninstance-wise skill composition, which limits the interpretation of the true\ncapabilities of LLMs. In this paper, we introduce FLASK (Fine-grained Language\nModel Evaluation based on Alignment SKill Sets), a fine-grained evaluation\nprotocol that can be used for both model-based and human-based evaluation which\ndecomposes coarse-level scoring to an instance-wise skill set-level.\nSpecifically, we define 12 fine-grained skills needed for LLMs to follow\nopen-ended user instructions and construct an evaluation set by allocating a\nset of skills for each instance. Additionally, by annotating the target domains\nand difficulty level for each instance, FLASK provides a holistic view with a\ncomprehensive analysis of a model's performance depending on skill, domain, and\ndifficulty. Through using FLASK, we compare multiple open-sourced and\nproprietary LLMs and observe highly-correlated findings between model-based and\nhuman-based evaluations. 
FLASK enables developers to more accurately measure\nthe model performance and how it can be improved by analyzing factors that make\nLLMs proficient in particular skills. For practitioners, FLASK can be used to\nrecommend suitable models for particular situations through comprehensive\ncomparison among various LLMs. We release the evaluation data and code\nimplementation at https://github.com/kaistAI/FLASK.\n","authors":["Seonghyeon Ye","Doyoung Kim","Sungdong Kim","Hyeonbin Hwang","Seungone Kim","Yongrae Jo","James Thorne","Juho Kim","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2307.10928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14030v2","updated":"2023-07-20T13:54:05Z","published":"2023-06-24T18:17:38Z","title":"My Boli: Code-mixed Marathi-English Corpora, Pretrained Language Models\n and Evaluation Benchmarks","summary":" The research on code-mixed data is limited due to the unavailability of\ndedicated code-mixed datasets and pre-trained language models. In this work, we\nfocus on the low-resource Indian language Marathi which lacks any prior work in\ncode-mixing. We present L3Cube-MeCorpus, a large code-mixed Marathi-English\n(Mr-En) corpus with 10 million social media sentences for pretraining. We also\nrelease L3Cube-MeBERT and MeRoBERTa, code-mixed BERT-based transformer models\npre-trained on MeCorpus. Furthermore, for benchmarking, we present three\nsupervised datasets MeHate, MeSent, and MeLID for downstream tasks like\ncode-mixed Mr-En hate speech detection, sentiment analysis, and language\nidentification respectively. These evaluation datasets individually consist of\nmanually annotated \\url{~}12,000 Marathi-English code-mixed tweets. Ablations\nshow that the models trained on this novel corpus significantly outperform the\nexisting state-of-the-art BERT models. This is the first work that presents\nartifacts for code-mixed Marathi research. All datasets and models are publicly\nreleased at https://github.com/l3cube-pune/MarathiNLP .\n","authors":["Tanmay Chavan","Omkar Gokhale","Aditya Kane","Shantanu Patankar","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2306.14030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10867v1","updated":"2023-07-20T13:40:22Z","published":"2023-07-20T13:40:22Z","title":"FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with\n Human Feedback","summary":" Captions are crucial for understanding scientific visualizations and\ndocuments. Existing captioning methods for scientific figures rely on\nfigure-caption pairs extracted from documents for training, many of which fall\nshort with respect to metrics like helpfulness, explainability, and\nvisual-descriptiveness [15] leading to generated captions being misaligned with\nreader preferences. To enable the generation of high-quality figure captions,\nwe introduce FigCaps-HF a new framework for figure-caption generation that can\nincorporate domain expert feedback in generating captions optimized for reader\npreferences. Our framework comprises of 1) an automatic method for evaluating\nquality of figure-caption pairs, 2) a novel reinforcement learning with human\nfeedback (RLHF) method to optimize a generative figure-to-caption model for\nreader preferences. We demonstrate the effectiveness of our simple learning\nframework by improving performance over standard fine-tuning across different\ntypes of models. 
In particular, when using BLIP as the base model, our RLHF\nframework achieves a mean gain of 35.7%, 16.9%, and 9% in ROUGE, BLEU, and\nMeteor, respectively. Finally, we release a large-scale benchmark dataset with\nhuman feedback on figure-caption pairs to enable further evaluation and\ndevelopment of RLHF techniques for this problem.\n","authors":["Ashish Singh","Prateek Agarwal","Zixuan Huang","Arpita Singh","Tong Yu","Sungchul Kim","Victor Bursztyn","Nikos Vlassis","Ryan A. Rossi"],"pdf_url":"https://arxiv.org/pdf/2307.10867v1.pdf","comment":"19 pages, 4 figures. Benchmark Documentation:\n https://figcapshf.github.io/"},{"id":"http://arxiv.org/abs/2307.10864v1","updated":"2023-07-20T13:33:28Z","published":"2023-07-20T13:33:28Z","title":"Divide & Bind Your Attention for Improved Generative Semantic Nursing","summary":" Emerging large-scale text-to-image generative models, e.g., Stable Diffusion\n(SD), have exhibited overwhelming results with high fidelity. Despite the\nmagnificent progress, current state-of-the-art models still struggle to\ngenerate images fully adhering to the input prompt. Prior work, Attend &\nExcite, has introduced the concept of Generative Semantic Nursing (GSN), aiming\nto optimize cross-attention during inference time to better incorporate the\nsemantics. It demonstrates promising results in generating simple prompts,\ne.g., ``a cat and a dog''. However, its efficacy declines when dealing with\nmore complex prompts, and it does not explicitly address the problem of\nimproper attribute binding. To address the challenges posed by complex prompts\nor scenarios involving multiple entities and to achieve improved attribute\nbinding, we propose Divide & Bind. We introduce two novel loss objectives for\nGSN: a novel attendance loss and a binding loss. Our approach stands out in its\nability to faithfully synthesize desired objects with improved attribute\nalignment from complex prompts and exhibits superior performance across\nmultiple evaluation benchmarks. More videos and updates can be found on the\nproject page \\url{https://sites.google.com/view/divide-and-bind}.\n","authors":["Yumeng Li","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2307.10864v1.pdf","comment":"Project page: \\url{https://sites.google.com/view/divide-and-bind}"},{"id":"http://arxiv.org/abs/2305.01146v3","updated":"2023-07-20T13:10:07Z","published":"2023-05-02T01:33:02Z","title":"RadAdapt: Radiology Report Summarization via Lightweight Domain\n Adaptation of Large Language Models","summary":" We systematically investigate lightweight strategies to adapt large language\nmodels (LLMs) for the task of radiology report summarization (RRS).\nSpecifically, we focus on domain adaptation via pretraining (on natural\nlanguage, biomedical text, or clinical text) and via discrete prompting or\nparameter-efficient fine-tuning. Our results consistently achieve best\nperformance by maximally adapting to the task via pretraining on clinical text\nand fine-tuning on RRS examples. Importantly, this method fine-tunes a mere\n0.32% of parameters throughout the model, in contrast to end-to-end fine-tuning\n(100% of parameters). Additionally, we study the effect of in-context examples\nand out-of-distribution (OOD) training before concluding with a radiologist\nreader study and qualitative analysis. 
Our findings highlight the importance of\ndomain adaptation in RRS and provide valuable insights toward developing\neffective natural language processing solutions for clinical tasks.\n","authors":["Dave Van Veen","Cara Van Uden","Maayane Attias","Anuj Pareek","Christian Bluethgen","Malgorzata Polacin","Wah Chiu","Jean-Benoit Delbrouck","Juan Manuel Zambrano Chaves","Curtis P. Langlotz","Akshay S. Chaudhari","John Pauly"],"pdf_url":"https://arxiv.org/pdf/2305.01146v3.pdf","comment":"12 pages, 10 figures. Published in ACL BioNLP. Compared to v1, v2\n includes minor edits and one additional figure in the appendix. Compared to\n v2, v3 includes a link to the project's GitHub repository"},{"id":"http://arxiv.org/abs/2307.10826v1","updated":"2023-07-20T12:41:35Z","published":"2023-07-20T12:41:35Z","title":"Yelp Reviews and Food Types: A Comparative Analysis of Ratings,\n Sentiments, and Topics","summary":" This study examines the relationship between Yelp reviews and food types,\ninvestigating how ratings, sentiments, and topics vary across different types\nof food. Specifically, we analyze how ratings and sentiments of reviews vary\nacross food types, cluster food types based on ratings and sentiments, infer\nreview topics using machine learning models, and compare topic distributions\namong different food types. Our analyses reveal that some food types have\nsimilar ratings, sentiments, and topics distributions, while others have\ndistinct patterns. We identify four clusters of food types based on ratings and\nsentiments and find that reviewers tend to focus on different topics when\nreviewing certain food types. These findings have important implications for\nunderstanding user behavior and cultural influence on digital media platforms\nand promoting cross-cultural understanding and appreciation.\n","authors":["Wenyu Liao","Yiqing Shi","Yujia Hu","Wei Quan"],"pdf_url":"https://arxiv.org/pdf/2307.10826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10814v1","updated":"2023-07-20T12:24:23Z","published":"2023-07-20T12:24:23Z","title":"Cross-Corpus Multilingual Speech Emotion Recognition: Amharic vs. Other\n Languages","summary":" In a conventional Speech emotion recognition (SER) task, a classifier for a\ngiven language is trained on a pre-existing dataset for that same language.\nHowever, where training data for a language does not exist, data from other\nlanguages can be used instead. We experiment with cross-lingual and\nmultilingual SER, working with Amharic, English, German and URDU. For Amharic,\nwe use our own publicly-available Amharic Speech Emotion Dataset (ASED). For\nEnglish, German and Urdu we use the existing RAVDESS, EMO-DB and URDU datasets.\nWe followed previous research in mapping labels for all datasets to just two\nclasses, positive and negative. Thus we can compare performance on different\nlanguages directly, and combine languages for training and testing. In\nExperiment 1, monolingual SER trials were carried out using three classifiers,\nAlexNet, VGGE (a proposed variant of VGG), and ResNet50. Results averaged for\nthe three models were very similar for ASED and RAVDESS, suggesting that\nAmharic and English SER are equally difficult. Similarly, German SER is more\ndifficult, and Urdu SER is easier. In Experiment 2, we trained on one language\nand tested on another, in both directions for each pair: Amharic<->German,\nAmharic<->English, and Amharic<->Urdu. Results with Amharic as target suggested\nthat using English or German as source will give the best result. 
In Experiment\n3, we trained on several non-Amharic languages and then tested on Amharic. The\nbest accuracy obtained was several percent greater than the best accuracy in\nExperiment 2, suggesting that a better result can be obtained when using two or\nthree non-Amharic languages for training than when using just one non-Amharic\nlanguage. Overall, the results suggest that cross-lingual and multilingual\ntraining can be an effective strategy for training a SER classifier when\nresources for a language are scarce.\n","authors":["Ephrem Afele Retta","Richard Sutcliffe","Jabar Mahmood","Michael Abebe Berwo","Eiad Almekhlafi","Sajjad Ahmed Khan","Shehzad Ashraf Chaudhry","Mustafa Mhamed","Jun Feng"],"pdf_url":"https://arxiv.org/pdf/2307.10814v1.pdf","comment":"16 pages, 9 tables, 5 figures"},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2307.10799v1","updated":"2023-07-20T12:01:40Z","published":"2023-07-20T12:01:40Z","title":"Layer-wise Representation Fusion for Compositional Generalization","summary":" Despite successes across a broad range of applications, sequence-to-sequence\nmodels' construct of solutions are argued to be less compositional than\nhuman-like generalization. There is mounting evidence that one of the reasons\nhindering compositional generalization is representations of the encoder and\ndecoder uppermost layer are entangled. In other words, the syntactic and\nsemantic representations of sequences are twisted inappropriately. 
However,\nmost previous studies mainly concentrate on enhancing token-level semantic\ninformation to alleviate the representations entanglement problem, rather than\ncomposing and using the syntactic and semantic representations of sequences\nappropriately as humans do. In addition, we explain why the entanglement\nproblem exists from the perspective of recent studies about training deeper\nTransformer, mainly owing to the ``shallow'' residual connections and its\nsimple, one-step operations, which fails to fuse previous layers' information\neffectively. Starting from this finding and inspired by humans' strategies, we\npropose \\textsc{FuSion} (\\textbf{Fu}sing \\textbf{S}yntactic and\nSemant\\textbf{i}c Representati\\textbf{on}s), an extension to\nsequence-to-sequence models to learn to fuse previous layers' information back\ninto the encoding and decoding process appropriately through introducing a\n\\emph{fuse-attention module} at each encoder and decoder layer. \\textsc{FuSion}\nachieves competitive and even \\textbf{state-of-the-art} results on two\nrealistic benchmarks, which empirically demonstrates the effectiveness of our\nproposal.\n","authors":["Yafang Zheng","Lei Lin","Zhaohong Lai","Binling Wang","Shan Liu","Biao Fu","Wenhao Rao","Peigen Ye","Yidong Chen","Xiaodong Shi"],"pdf_url":"https://arxiv.org/pdf/2307.10799v1.pdf","comment":"work in progress. arXiv admin note: substantial text overlap with\n arXiv:2305.12169"},{"id":"http://arxiv.org/abs/2210.11835v2","updated":"2023-07-20T11:56:40Z","published":"2022-10-21T09:28:54Z","title":"A Textless Metric for Speech-to-Speech Comparison","summary":" In this paper, we introduce a new and simple method for comparing speech\nutterances without relying on text transcripts. Our speech-to-speech comparison\nmetric utilizes state-of-the-art speech2unit encoders like HuBERT to convert\nspeech utterances into discrete acoustic units. We then propose a simple and\neasily replicable neural architecture that learns a speech-based metric that\nclosely corresponds to its text-based counterpart. This textless metric has\nnumerous potential applications, including evaluating speech-to-speech\ntranslation for oral languages, languages without dependable ASR systems, or to\navoid the need for ASR transcription altogether. This paper also shows that for\nspeech-to-speech translation evaluation, ASR-BLEU (which consists in\nautomatically transcribing both speech hypothesis and reference and compute\nsentence-level BLEU between transcripts) is a poor proxy to real text-BLEU even\nwhen ASR system is strong.\n","authors":["Laurent Besacier","Swen Ribeiro","Olivier Galibert","Ioan Calapodescu"],"pdf_url":"https://arxiv.org/pdf/2210.11835v2.pdf","comment":"link to supplementary material:\n https://github.com/besacier/textless-metric"},{"id":"http://arxiv.org/abs/2307.10778v1","updated":"2023-07-20T11:29:15Z","published":"2023-07-20T11:29:15Z","title":"Extreme Multi-Label Skill Extraction Training using Large Language\n Models","summary":" Online job ads serve as a valuable source of information for skill\nrequirements, playing a crucial role in labor market analysis and e-recruitment\nprocesses. Since such ads are typically formatted in free text, natural\nlanguage processing (NLP) technologies are required to automatically process\nthem. 
We specifically focus on the task of detecting skills (mentioned\nliterally, or implicitly described) and linking them to a large skill ontology,\nmaking it a challenging case of extreme multi-label classification (XMLC).\nGiven that no sizable labeled (training) dataset is available for\nthis specific XMLC task, we propose techniques to leverage general Large\nLanguage Models (LLMs). We describe a cost-effective approach to generate an\naccurate, fully synthetic labeled dataset for skill extraction, and present a\ncontrastive learning strategy that proves effective in the task. Our results\nacross three skill extraction benchmarks show a consistent increase of between\n15 and 25 percentage points in \\textit{R-Precision@5} compared to previously\npublished results that relied solely on distant supervision through literal\nmatches.\n","authors":["Jens-Joris Decorte","Severine Verlinden","Jeroen Van Hautte","Johannes Deleu","Chris Develder","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2307.10778v1.pdf","comment":"Accepted to the International workshop on AI for Human Resources and\n Public Employment Services (AI4HR&PES) as part of ECML-PKDD 2023"},{"id":"http://arxiv.org/abs/2305.15299v2","updated":"2023-07-20T10:43:57Z","published":"2023-05-24T16:23:46Z","title":"Science in the Era of ChatGPT, Large Language Models and Generative AI:\n Challenges for Research Ethics and How to Respond","summary":" Large language models of artificial intelligence (AI), such as ChatGPT, find\nremarkable but controversial applicability in science and research. This paper\nreviews epistemological challenges, ethical and integrity risks in science\nconduct in the advent of generative AI. This is with the aim to lay new timely\nfoundations for a high-quality research ethics review. The role of AI language\nmodels as a research instrument and subject is scrutinized along with ethical\nimplications for scientists, participants and reviewers. New emerging practices\nfor research ethics review are discussed, concluding with ten recommendations\nthat shape a response for a more responsible research conduct in the era of AI.\n","authors":["Evangelos Pournaras"],"pdf_url":"https://arxiv.org/pdf/2305.15299v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10757v1","updated":"2023-07-20T10:42:16Z","published":"2023-07-20T10:42:16Z","title":"Vesper: A Compact and Effective Pretrained Model for Speech Emotion\n Recognition","summary":" This paper presents a paradigm that adapts general large-scale pretrained\nmodels (PTMs) to the speech emotion recognition task. Although PTMs shed new light\non artificial general intelligence, they are constructed with general tasks in\nmind, and thus, their efficacy for specific tasks can be further improved.\nAdditionally, employing PTMs in practical applications can be challenging due\nto their considerable size. The above limitations spawn another research direction,\nnamely, optimizing large-scale PTMs for specific tasks to generate\ntask-specific PTMs that are both compact and effective. In this paper, we focus\non the speech emotion recognition task and propose an improved emotion-specific\npretrained encoder called Vesper. Vesper is pretrained on a speech dataset\nbased on WavLM and takes into account emotional characteristics. To enhance\nsensitivity to emotional information, Vesper employs an emotion-guided masking\nstrategy to identify the regions that need masking. 
Subsequently, Vesper\nemploys hierarchical and cross-layer self-supervision to improve its ability to\ncapture acoustic and semantic representations, both of which are crucial for\nemotion recognition. Experimental results on the IEMOCAP, MELD, and CREMA-D\ndatasets demonstrate that Vesper with 4 layers outperforms WavLM Base with 12\nlayers, and the performance of Vesper with 12 layers surpasses that of WavLM\nLarge with 24 layers.\n","authors":["Weidong Chen","Xiaofen Xing","Peihao Chen","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2307.10757v1.pdf","comment":"13 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.10751v1","updated":"2023-07-20T10:26:57Z","published":"2023-07-20T10:26:57Z","title":"Exploring Perspectives on the Impact of Artificial Intelligence on the\n Creativity of Knowledge Work: Beyond Mechanised Plagiarism and Stochastic\n Parrots","summary":" Artificial Intelligence (AI), and in particular generative models, are\ntransformative tools for knowledge work. They problematise notions of\ncreativity, originality, plagiarism, the attribution of credit, and copyright\nownership. Critics of generative models emphasise the reliance on large amounts\nof training data, and view the output of these models as no more than\nrandomised plagiarism, remix, or collage of the source data. On these grounds,\nmany have argued for stronger regulations on the deployment, use, and\nattribution of the output of these models. However, these issues are not new or\nunique to artificial intelligence. In this position paper, using examples from\nliterary criticism, the history of art, and copyright law, I show how\ncreativity and originality resist definition as a notatable or\ninformation-theoretic property of an object, and instead can be seen as the\nproperty of a process, an author, or a viewer. Further alternative views hold\nthat all creative work is essentially reuse (mostly without attribution), or\nthat randomness itself can be creative. I suggest that creativity is ultimately\ndefined by communities of creators and receivers, and the deemed sources of\ncreativity in a workflow often depend on which parts of the workflow can be\nautomated. Using examples from recent studies of AI in creative knowledge work,\nI suggest that AI shifts knowledge work from material production to critical\nintegration. This position paper aims to begin a conversation around a more\nnuanced approach to the problems of creativity and credit assignment for\ngenerative models, one which more fully recognises the importance of the\ncreative and curatorial voice of the users of these models and moves away from\nsimpler notational or information-theoretic views.\n","authors":["Advait Sarkar"],"pdf_url":"https://arxiv.org/pdf/2307.10751v1.pdf","comment":"Advait Sarkar. 2023. Exploring Perspectives on the Impact of\n Artificial Intelligence on the Creativity of Knowledge Work Beyond Mechanised\n Plagiarism and Stochastic Parrots. In Annual Symposium on Human-Computer\n Interaction for Work 2023 (CHIWORK 2023), June 13-16, 2023, Oldenburg,\n Germany. ACM, New York, NY, USA, 17 pages"},{"id":"http://arxiv.org/abs/2301.11596v4","updated":"2023-07-20T08:58:12Z","published":"2023-01-27T08:45:53Z","title":"ThoughtSource: A central hub for large language model reasoning data","summary":" Large language models (LLMs) such as GPT-4 have recently demonstrated\nimpressive results across a wide range of tasks. 
LLMs are still limited,\nhowever, in that they frequently fail at complex reasoning, their reasoning\nprocesses are opaque, they are prone to 'hallucinate' facts, and there are\nconcerns about their underlying biases. Letting models verbalize reasoning\nsteps as natural language, a technique known as chain-of-thought prompting, has\nrecently been proposed as a way to address some of these issues. Here we\npresent ThoughtSource, a meta-dataset and software library for chain-of-thought\n(CoT) reasoning. The goal of ThoughtSource is to improve future artificial\nintelligence systems by facilitating qualitative understanding of CoTs,\nenabling empirical evaluations, and providing training data. This first release\nof ThoughtSource integrates six scientific/medical, three general-domain and\nfive math word question answering datasets.\n","authors":["Simon Ott","Konstantin Hebenstreit","Valentin Liévin","Christoffer Egeberg Hother","Milad Moradi","Maximilian Mayrhauser","Robert Praas","Ole Winther","Matthias Samwald"],"pdf_url":"https://arxiv.org/pdf/2301.11596v4.pdf","comment":"Revision: added datasets, formatting"},{"id":"http://arxiv.org/abs/2011.00696v2","updated":"2023-07-20T08:56:26Z","published":"2020-11-02T03:07:38Z","title":"ABNIRML: Analyzing the Behavior of Neural IR Models","summary":" Pretrained contextualized language models such as BERT and T5 have\nestablished a new state-of-the-art for ad-hoc search. However, it is not yet\nwell-understood why these methods are so effective, what makes some variants\nmore effective than others, and what pitfalls they may have. We present a new\ncomprehensive framework for Analyzing the Behavior of Neural IR ModeLs\n(ABNIRML), which includes new types of diagnostic probes that allow us to test\nseveral characteristics -- such as writing styles, factuality, sensitivity to\nparaphrasing and word order -- that are not addressed by previous techniques.\nTo demonstrate the value of the framework, we conduct an extensive empirical\nstudy that yields insights into the factors that contribute to the neural\nmodel's gains, and identify potential unintended biases the models exhibit.\nSome of our results confirm conventional wisdom, like that recent neural\nranking models rely less on exact term overlap with the query, and instead\nleverage richer linguistic information, evidenced by their higher sensitivity\nto word and sentence order. Other results are more surprising, such as that\nsome models (e.g., T5 and ColBERT) are biased towards factually correct (rather\nthan simply relevant) texts. Further, some characteristics vary even for the\nsame base language model, and other characteristics can appear due to random\nvariations during model training.\n","authors":["Sean MacAvaney","Sergey Feldman","Nazli Goharian","Doug Downey","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2011.00696v2.pdf","comment":"TACL version"},{"id":"http://arxiv.org/abs/2306.06427v2","updated":"2023-07-20T08:47:14Z","published":"2023-06-10T12:42:36Z","title":"Boosting Language Models Reasoning with Chain-of-Knowledge Prompting","summary":" Recently, Chain-of-Thought (CoT) prompting has delivered success on complex\nreasoning tasks, which aims at designing a simple prompt like ``Let's think\nstep by step'' or multiple in-context exemplars with well-designed rationales\nto elicit Large Language Models (LLMs) to generate intermediate reasoning\nsteps. However, the generated rationales often come with mistakes, making\nunfactual and unfaithful reasoning chains. 
To mitigate this brittleness, we\npropose a novel Chain-of-Knowledge (CoK) prompting, where we aim at eliciting\nLLMs to generate explicit pieces of knowledge evidence in the form of structure\ntriple. This is inspired by our human behaviors, i.e., we can draw a mind map\nor knowledge map as the reasoning evidence in the brain before answering a\ncomplex question. Benefiting from CoK, we additionally introduce a\nF^2-Verification method to estimate the reliability of the reasoning chains in\nterms of factuality and faithfulness. For the unreliable response, the wrong\nevidence can be indicated to prompt the LLM to rethink. Extensive experiments\ndemonstrate that our method can further improve the performance of commonsense,\nfactual, symbolic, and arithmetic reasoning tasks.\n","authors":["Jianing Wang","Qiushi Sun","Nuo Chen","Xiang Li","Ming Gao"],"pdf_url":"https://arxiv.org/pdf/2306.06427v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2307.10700v1","updated":"2023-07-20T08:45:00Z","published":"2023-07-20T08:45:00Z","title":"Large language models shape and are shaped by society: A survey of arXiv\n publication patterns","summary":" There has been a steep recent increase in the number of large language model\n(LLM) papers, producing a dramatic shift in the scientific landscape which\nremains largely undocumented through bibliometric analysis. Here, we analyze\n388K papers posted on the CS and Stat arXivs, focusing on changes in\npublication patterns in 2023 vs. 2018-2022. We analyze how the proportion of\nLLM papers is increasing; the LLM-related topics receiving the most attention;\nthe authors writing LLM papers; how authors' research topics correlate with\ntheir backgrounds; the factors distinguishing highly cited LLM papers; and the\npatterns of international collaboration. We show that LLM research increasingly\nfocuses on societal impacts: there has been an 18x increase in the proportion\nof LLM-related papers on the Computers and Society sub-arXiv, and authors newly\npublishing on LLMs are more likely to focus on applications and societal\nimpacts than more experienced authors. LLM research is also shaped by social\ndynamics: we document gender and academic/industry disparities in the topics\nLLM authors focus on, and a US/China schism in the collaboration network.\nOverall, our analysis documents the profound ways in which LLM research both\nshapes and is shaped by society, attesting to the necessity of sociotechnical\nlenses.\n","authors":["Rajiv Movva","Sidhika Balachandar","Kenny Peng","Gabriel Agostini","Nikhil Garg","Emma Pierson"],"pdf_url":"https://arxiv.org/pdf/2307.10700v1.pdf","comment":"Working paper"},{"id":"http://arxiv.org/abs/2303.12112v3","updated":"2023-07-20T08:16:09Z","published":"2023-03-21T18:03:14Z","title":"Positive-Augmented Contrastive Learning for Image and Video Captioning\n Evaluation","summary":" The CLIP model has been recently proven to be very effective for a variety of\ncross-modal tasks, including the evaluation of captions generated from\nvision-and-language architectures. In this paper, we propose a new recipe for a\ncontrastive-based evaluation metric for image captioning, namely\nPositive-Augmented Contrastive learning Score (PAC-S), that in a novel way\nunifies the learning of a contrastive visual-semantic space with the addition\nof generated images and text on curated data. 
Experiments spanning several\ndatasets demonstrate that our new metric achieves the highest correlation with\nhuman judgments on both images and videos, outperforming existing\nreference-based metrics like CIDEr and SPICE and reference-free metrics like\nCLIP-Score. Finally, we test the system-level correlation of the proposed\nmetric when considering popular image captioning approaches, and assess the\nimpact of employing different cross-modal features. Our source code and trained\nmodels are publicly available at: https://github.com/aimagelab/pacscore.\n","authors":["Sara Sarto","Manuele Barraco","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2303.12112v3.pdf","comment":"CVPR 2023 (highlight paper)"},{"id":"http://arxiv.org/abs/2307.10666v1","updated":"2023-07-20T07:47:08Z","published":"2023-07-20T07:47:08Z","title":"A Dataset and Strong Baselines for Classification of Czech News Texts","summary":" Pre-trained models for Czech Natural Language Processing are often evaluated\non purely linguistic tasks (POS tagging, parsing, NER) and relatively simple\nclassification tasks such as sentiment classification or article classification\nfrom a single news source. As an alternative, we present\nCZEch~NEws~Classification~dataset (CZE-NEC), one of the largest Czech\nclassification datasets, composed of news articles from various sources\nspanning over twenty years, which allows a more rigorous evaluation of such\nmodels. We define four classification tasks: news source, news category,\ninferred author's gender, and day of the week. To verify the task difficulty,\nwe conducted a human evaluation, which revealed that human performance lags\nbehind strong machine-learning baselines built upon pre-trained transformer\nmodels. Furthermore, we show that language-specific pre-trained encoder\nanalysis outperforms selected commercially available large-scale generative\nlanguage models.\n","authors":["Hynek Kydlíček","Jindřich Libovický"],"pdf_url":"https://arxiv.org/pdf/2307.10666v1.pdf","comment":"12 pages, Accepted to Text, Speech and Dialogue (TSD) 2023"},{"id":"http://arxiv.org/abs/2307.10652v1","updated":"2023-07-20T07:33:30Z","published":"2023-07-20T07:33:30Z","title":"Exploring the Landscape of Natural Language Processing Research","summary":" As an efficient approach to understand, generate, and process natural\nlanguage texts, research in natural language processing (NLP) has exhibited a\nrapid spread and wide adoption in recent years. Given the increasing amount of\nresearch work in this area, several NLP-related approaches have been surveyed\nin the research community. However, a comprehensive study that categorizes\nestablished topics, identifies trends, and outlines areas for future research\nremains absent to this day. Contributing to closing this gap, we have\nsystematically classified and analyzed research papers included in the ACL\nAnthology. 
As a result, we present a structured overview of the research\nlandscape, provide a taxonomy of fields-of-study in NLP, analyze recent\ndevelopments in NLP, summarize our findings, and highlight directions for\nfuture work.\n","authors":["Tim Schopf","Karim Arabi","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.10652v1.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2307.10635v1","updated":"2023-07-20T07:01:57Z","published":"2023-07-20T07:01:57Z","title":"SciBench: Evaluating College-Level Scientific Problem-Solving Abilities\n of Large Language Models","summary":" Recent advances in large language models (LLMs) have demonstrated notable\nprogress on many mathematical benchmarks. However, most of these benchmarks\nonly feature problems grounded in junior and senior high school subjects,\ncontain only multiple-choice questions, and are confined to a limited scope of\nelementary arithmetic operations. To address these issues, this paper\nintroduces an expansive benchmark suite SciBench that aims to systematically\nexamine the reasoning capabilities required for complex scientific problem\nsolving. SciBench contains two carefully curated datasets: an open set\nfeaturing a range of collegiate-level scientific problems drawn from\nmathematics, chemistry, and physics textbooks, and a closed set comprising\nproblems from undergraduate-level exams in computer science and mathematics.\nBased on the two datasets, we conduct an in-depth benchmark study of two\nrepresentative LLMs with various prompting strategies. The results reveal that\ncurrent LLMs fall short of delivering satisfactory performance, with an overall\nscore of merely 35.80%. Furthermore, through a detailed user study, we\ncategorize the errors made by LLMs into ten problem-solving abilities. Our\nanalysis indicates that no single prompting strategy significantly outperforms\nothers and some strategies that demonstrate improvements in certain\nproblem-solving skills result in declines in other skills. We envision that\nSciBench will catalyze further developments in the reasoning abilities of LLMs,\nthereby ultimately contributing to scientific research and discovery.\n","authors":["Xiaoxuan Wang","Ziniu Hu","Pan Lu","Yanqiao Zhu","Jieyu Zhang","Satyen Subramaniam","Arjun R. Loomba","Shichang Zhang","Yizhou Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10635v1.pdf","comment":"Work in progress, 18 pages"},{"id":"http://arxiv.org/abs/2307.10634v1","updated":"2023-07-20T06:59:02Z","published":"2023-07-20T06:59:02Z","title":"Generative Language Models on Nucleotide Sequences of Human Genes","summary":" Language models, primarily transformer-based ones, obtained colossal success\nin NLP. To be more precise, studies like BERT in NLU and works such as GPT-3\nfor NLG are very crucial. DNA sequences are very close to natural language in\nterms of structure, so if the DNA-related bioinformatics domain is concerned,\ndiscriminative models, like DNABert, exist. Yet, the generative side of the\ncoin is mainly unexplored to the best of our knowledge. Consequently, we\nfocused on developing an autoregressive generative language model like GPT-3\nfor DNA sequences. 
Because working with whole DNA sequences is challenging\nwithout substantial computational resources, we decided to carry out our study\non a smaller scale, focusing on nucleotide sequences of human genes, unique\nparts in DNA with specific functionalities, instead of the whole DNA. This\ndecision did not change the problem structure a lot due to the fact that both\nDNA and genes can be seen as 1D sequences consisting of four different\nnucleotides without losing much information and making too much simplification.\nFirst of all, we systematically examined an almost entirely unexplored problem\nand observed that RNNs performed the best while simple techniques like N-grams\nwere also promising. Another beneficial point was learning how to work with\ngenerative models on languages we do not understand, unlike natural language.\nHow essential using real-life tasks beyond the classical metrics such as\nperplexity is observed. Furthermore, checking whether the data-hungry nature of\nthese models can be changed through selecting a language with minimal\nvocabulary size, four owing to four different types of nucleotides, is\nexamined. The reason for reviewing this was that choosing such a language might\nmake the problem easier. However, what we observed in this study was it did not\nprovide that much of a change in the amount of data needed.\n","authors":["Musa Nuri Ihtiyar","Arzucan Ozgur"],"pdf_url":"https://arxiv.org/pdf/2307.10634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10633v1","updated":"2023-07-20T06:58:55Z","published":"2023-07-20T06:58:55Z","title":"Multi-Method Self-Training: Improving Code Generation With Text, And\n Vice Versa","summary":" Large Language Models have many methods for solving the same problem. This\nintroduces novel strengths (different methods may work well for different\nproblems) and weaknesses (it may be difficult for users to know which method to\nuse). In this paper, we introduce Multi-Method Self-Training (MMST), where one\nmethod is trained on the filtered outputs of another, allowing us to augment\nthe strengths and ameliorate the weaknesses of each method. Using a 176B\nparameter model trained on both language and code, we show that MMST can 1)\nimprove the less performant method (up to 30%) making the model easier to use,\n2) improve the more performant method (up to 32.2%) making the model more\nperformant, and 3) improve the performance of related but distinct tasks (up to\n10.3%) by improving the ability of the model to generate rationales. We then\nconduct ablation analyses to explore why MMST works. We show that MMST\ngenerates more data than traditional self-training, but the improvement in\nperformance is driven by the use of multiple methods. We also analyze\nprompt-engineering and anti-correlated performance between methods as means of\nmaking MMST more effective. We hope the evidence from our paper motivates\nmachine learning researchers to explore ways in which advances in language\nmodels allow for new forms of training.\n","authors":["Shriyash K. Upadhyay","Etan J. 
Ginsberg"],"pdf_url":"https://arxiv.org/pdf/2307.10633v1.pdf","comment":"23 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.10587v1","updated":"2023-07-20T05:03:00Z","published":"2023-07-20T05:03:00Z","title":"A Deep Dive into the Disparity of Word Error Rates Across Thousands of\n NPTEL MOOC Videos","summary":" Automatic speech recognition (ASR) systems are designed to transcribe spoken\nlanguage into written text and find utility in a variety of applications\nincluding voice assistants and transcription services. However, it has been\nobserved that state-of-the-art ASR systems which deliver impressive benchmark\nresults, struggle with speakers of certain regions or demographics due to\nvariation in their speech properties. In this work, we describe the curation of\na massive speech dataset of 8740 hours consisting of $\\sim9.8$K technical\nlectures in the English language along with their transcripts delivered by\ninstructors representing various parts of Indian demography. The dataset is\nsourced from the very popular NPTEL MOOC platform. We use the curated dataset\nto measure the existing disparity in YouTube Automatic Captions and OpenAI\nWhisper model performance across the diverse demographic traits of speakers in\nIndia. While there exists disparity due to gender, native region, age and\nspeech rate of speakers, disparity based on caste is non-existent. We also\nobserve statistically significant disparity across the disciplines of the\nlectures. These results indicate the need of more inclusive and robust ASR\nsystems and more representational datasets for disparity evaluation in them.\n","authors":["Anand Kumar Rai","Siddharth D Jaiswal","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2307.10587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10558v1","updated":"2023-07-20T03:54:24Z","published":"2023-07-20T03:54:24Z","title":"Instruction-following Evaluation through Verbalizer Manipulation","summary":" While instruction-tuned models have shown remarkable success in various\nnatural language processing tasks, accurately evaluating their ability to\nfollow instructions remains challenging. Existing benchmarks primarily focus on\ncommon instructions that align well with what the model learned during\ntraining. However, proficiency in responding to these instructions does not\nnecessarily imply strong ability in instruction following. In this paper, we\npropose a novel instruction-following evaluation protocol called verbalizer\nmanipulation. It instructs the model to verbalize the task label with words\naligning with model priors to different extents, adopting verbalizers from\nhighly aligned (e.g., outputting ``postive'' for positive sentiment), to\nminimally aligned (e.g., outputting ``negative'' for positive sentiment).\nVerbalizer manipulation can be seamlessly integrated with any classification\nbenchmark to examine the model's reliance on priors and its ability to override\nthem to accurately follow the instructions. We conduct a comprehensive\nevaluation of four major model families across nine datasets, employing twelve\nsets of verbalizers for each of them. We observe that the instruction-following\nabilities of models, across different families and scales, are significantly\ndistinguished by their performance on less natural verbalizers. 
Even the\nstrongest GPT-4 model struggles to perform better than random guessing on the\nmost challenging verbalizer, emphasizing the need for continued advancements to\nimprove their instruction-following abilities.\n","authors":["Shiyang Li","Jun Yan","Hai Wang","Zheng Tang","Xiang Ren","Vijay Srinivasan","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2307.10558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14795v2","updated":"2023-07-20T03:39:19Z","published":"2023-06-26T15:53:02Z","title":"MotionGPT: Human Motion as a Foreign Language","summary":" Though the advancement of pre-trained large language models unfolds, the\nexploration of building a unified model for language and other multi-modal\ndata, such as motion, remains challenging and untouched so far. Fortunately,\nhuman motion displays a semantic coupling akin to human language, often\nperceived as a form of body language. By fusing language data with large-scale\nmotion models, motion-language pre-training that can enhance the performance of\nmotion-related tasks becomes feasible. Driven by this insight, we propose\nMotionGPT, a unified, versatile, and user-friendly motion-language model to\nhandle multiple motion-relevant tasks. Specifically, we employ the discrete\nvector quantization for human motion and transfer 3D motion into motion tokens,\nsimilar to the generation process of word tokens. Building upon this \"motion\nvocabulary\", we perform language modeling on both motion and text in a unified\nmanner, treating human motion as a specific language. Moreover, inspired by\nprompt learning, we pre-train MotionGPT with a mixture of motion-language data\nand fine-tune it on prompt-based question-and-answer tasks. Extensive\nexperiments demonstrate that MotionGPT achieves state-of-the-art performances\non multiple motion tasks including text-driven motion generation, motion\ncaptioning, motion prediction, and motion in-between.\n","authors":["Biao Jiang","Xin Chen","Wen Liu","Jingyi Yu","Gang Yu","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.14795v2.pdf","comment":"Project Page: https://github.com/OpenMotionLab/MotionGPT"},{"id":"http://arxiv.org/abs/2307.10549v1","updated":"2023-07-20T03:26:57Z","published":"2023-07-20T03:26:57Z","title":"Dynamic Large Language Models on Blockchains","summary":" Training and deploying the large language models requires a large amount of\ncomputational resources because the language models contain billions of\nparameters and the text has thousands of tokens. Another problem is that the\nlarge language models are static. They are fixed after the training process. To\ntackle these issues, in this paper, we propose to train and deploy the dynamic\nlarge language model on blockchains, which have high computation performance\nand are distributed across a network of computers. A blockchain is a secure,\ndecentralized, and transparent system that allows for the creation of a\ntamper-proof ledger for transactions without the need for intermediaries. The\ndynamic large language models can continuously learn from the user input after\nthe training process. 
Our method provides a new way to develop the large\nlanguage models and also sheds a light on the next generation artificial\nintelligence systems.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2307.10549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00470v4","updated":"2023-07-20T03:03:25Z","published":"2023-07-02T04:32:41Z","title":"PatternGPT :A Pattern-Driven Framework for Large Language Model Text\n Generation","summary":" Large language models(LLMS)have shown excellent text generation capabilities,\ncapable of generating fluent human-like responses for many downstream tasks.\nHowever, applying large language models to real-world critical tasks remains\nchallenging due to their susceptibility to hallucinations and inability to\ndirectly use external knowledge. To cope with the above challenges, this paper\nproposes PatternGPT, a pattern-driven text generation framework for Large\nLanguage Models. Firstly, the framework utilizes the extraction capability of\nLarge Language Models to generate rich and diversified structured and\nformalized patterns, which facilitates the introduction of external knowledge\nto do the computation, and then draws on the idea of federated learning to use\nmultiple agents to achieve the sharing in order to obtain more diversified\npatterns, and finally uses judgment criteria and optimization algorithm to\nsearch for high-quality patterns to guide the generation of models. Finally,\nexternal knowledge such as judgment criteria and optimization algorithms are\nused to search for high-quality patterns, and the searched patterns are used to\nguide model generation. This framework has the advantages of generating\ndiversified patterns, protecting data privacy, combining external knowledge,\nand improving the quality of generation, which provides an effective method to\noptimize the text generation capability of large language models, and make it\nbetter applied to the field of intelligent dialogue and content generation.\n","authors":["Le Xiao","Xin Shan"],"pdf_url":"https://arxiv.org/pdf/2307.00470v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10168v2","updated":"2023-07-20T02:29:25Z","published":"2023-07-19T17:54:43Z","title":"LLMs as Workers in Human-Computational Algorithms? Replicating\n Crowdsourcing Pipelines with LLMs","summary":" LLMs have shown promise in replicating human-like behavior in crowdsourcing\ntasks that were previously thought to be exclusive to human abilities. However,\ncurrent efforts focus mainly on simple atomic tasks. We explore whether LLMs\ncan replicate more complex crowdsourcing pipelines. We find that modern LLMs\ncan simulate some of crowdworkers' abilities in these \"human computation\nalgorithms,\" but the level of success is variable and influenced by requesters'\nunderstanding of LLM capabilities, the specific skills required for sub-tasks,\nand the optimal interaction modality for performing these sub-tasks. We reflect\non human and LLMs' different sensitivities to instructions, stress the\nimportance of enabling human-facing safeguards for LLMs, and discuss the\npotential of training humans and LLMs with complementary skill sets. 
Crucially,\nwe show that replicating crowdsourcing pipelines offers a valuable platform to\ninvestigate (1) the relative strengths of LLMs on different tasks (by\ncross-comparing their performances on sub-tasks) and (2) LLMs' potential in\ncomplex tasks, where they can complete part of the tasks while leaving others\nto humans.\n","authors":["Tongshuang Wu","Haiyi Zhu","Maya Albayrak","Alexis Axon","Amanda Bertsch","Wenxing Deng","Ziqi Ding","Bill Guo","Sireesh Gururaja","Tzu-Sheng Kuo","Jenny T. Liang","Ryan Liu","Ihita Mandal","Jeremiah Milbauer","Xiaolin Ni","Namrata Padmanabhan","Subhashini Ramkumar","Alexis Sudjianto","Jordan Taylor","Ying-Jui Tseng","Patricia Vaidos","Zhijin Wu","Wei Wu","Chenyang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10168v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11296v2","updated":"2023-07-20T02:20:35Z","published":"2023-06-20T05:20:29Z","title":"ChatGPT Chemistry Assistant for Text Mining and Prediction of MOF\n Synthesis","summary":" We use prompt engineering to guide ChatGPT in the automation of text mining\nof metal-organic frameworks (MOFs) synthesis conditions from diverse formats\nand styles of the scientific literature. This effectively mitigates ChatGPT's\ntendency to hallucinate information -- an issue that previously made the use of\nLarge Language Models (LLMs) in scientific fields challenging. Our approach\ninvolves the development of a workflow implementing three different processes\nfor text mining, programmed by ChatGPT itself. All of them enable parsing,\nsearching, filtering, classification, summarization, and data unification with\ndifferent tradeoffs between labor, speed, and accuracy. We deploy this system\nto extract 26,257 distinct synthesis parameters pertaining to approximately 800\nMOFs sourced from peer-reviewed research articles. This process incorporates\nour ChemPrompt Engineering strategy to instruct ChatGPT in text mining,\nresulting in impressive precision, recall, and F1 scores of 90-99%.\nFurthermore, with the dataset built by text mining, we constructed a\nmachine-learning model with over 86% accuracy in predicting MOF experimental\ncrystallization outcomes and preliminarily identifying important factors in MOF\ncrystallization. We also developed a reliable data-grounded MOF chatbot to\nanswer questions on chemical reactions and synthesis procedures. Given that the\nprocess of using ChatGPT reliably mines and tabulates diverse MOF synthesis\ninformation in a unified format, while using only narrative language requiring\nno coding expertise, we anticipate that our ChatGPT Chemistry Assistant will be\nvery useful across various other chemistry sub-disciplines.\n","authors":["Zhiling Zheng","Oufan Zhang","Christian Borgs","Jennifer T. Chayes","Omar M. Yaghi"],"pdf_url":"https://arxiv.org/pdf/2306.11296v2.pdf","comment":"Published on Journal of the American Chemical Society (2023); 102\n pages (18-page manuscript, 84 pages of supporting information)"},{"id":"http://arxiv.org/abs/2307.07946v2","updated":"2023-07-20T02:01:34Z","published":"2023-07-16T04:50:52Z","title":"Unifying Token and Span Level Supervisions for Few-Shot Sequence\n Labeling","summary":" Few-shot sequence labeling aims to identify novel classes based on only a few\nlabeled samples. 
Existing methods solve the data scarcity problem mainly by\ndesigning token-level or span-level labeling models based on metric learning.\nHowever, these methods are only trained at a single granularity (i.e., either\ntoken level or span level) and have some weaknesses of the corresponding\ngranularity. In this paper, we first unify token and span level supervisions\nand propose a Consistent Dual Adaptive Prototypical (CDAP) network for few-shot\nsequence labeling. CDAP contains the token-level and span-level networks,\njointly trained at different granularities. To align the outputs of two\nnetworks, we further propose a consistent loss to enable them to learn from\neach other. During the inference phase, we propose a consistent greedy\ninference algorithm that first adjusts the predicted probability and then\ngreedily selects non-overlapping spans with maximum probability. Extensive\nexperiments show that our model achieves new state-of-the-art results on three\nbenchmark datasets.\n","authors":["Zifeng Cheng","Qingyu Zhou","Zhiwei Jiang","Xuemin Zhao","Yunbo Cao","Qing Gu"],"pdf_url":"https://arxiv.org/pdf/2307.07946v2.pdf","comment":"Accepted by ACM Transactions on Information Systems"},{"id":"http://arxiv.org/abs/2307.10522v1","updated":"2023-07-20T01:48:51Z","published":"2023-07-20T01:48:51Z","title":"Gender-tuning: Empowering Fine-tuning for Debiasing Pre-trained Language\n Models","summary":" Recent studies have revealed that the widely-used Pre-trained Language Models\n(PLMs) propagate societal biases from the large unmoderated pre-training\ncorpora. Existing solutions require debiasing training processes and datasets\nfor debiasing, which are resource-intensive and costly. Furthermore, these\nmethods hurt the PLMs' performance on downstream tasks. In this study, we\npropose Gender-tuning, which debiases the PLMs through fine-tuning on\ndownstream tasks' datasets. For this aim, Gender-tuning integrates Masked\nLanguage Modeling (MLM) training objectives into fine-tuning's training\nprocess. Comprehensive experiments show that Gender-tuning outperforms the\nstate-of-the-art baselines in terms of average gender bias scores in PLMs while\nimproving PLMs' performance on downstream tasks solely using the downstream\ntasks' dataset. Also, Gender-tuning is a deployable debiasing tool for any PLM\nthat works with original fine-tuning.\n","authors":["Somayeh Ghanbarzadeh","Yan Huang","Hamid Palangi","Radames Cruz Moreno","Hamed Khanpour"],"pdf_url":"https://arxiv.org/pdf/2307.10522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10514v1","updated":"2023-07-20T01:26:34Z","published":"2023-07-20T01:26:34Z","title":"Building Socio-culturally Inclusive Stereotype Resources with Community\n Engagement","summary":" With rapid development and deployment of generative language models in global\nsettings, there is an urgent need to also scale our measurements of harm, not\njust in the number and types of harms covered, but also how well they account\nfor local cultural contexts, including marginalized identities and the social\nbiases experienced by them. Current evaluation paradigms are limited in their\nabilities to address this, as they are not representative of diverse, locally\nsituated but global, socio-cultural perspectives. It is imperative that our\nevaluation resources are enhanced and calibrated by including people and\nexperiences from different cultures and societies worldwide, in order to\nprevent gross underestimations or skews in measurements of harm. 
In this work,\nwe demonstrate a socio-culturally aware expansion of evaluation resources in\nthe Indian societal context, specifically for the harm of stereotyping. We\ndevise a community engaged effort to build a resource which contains\nstereotypes for axes of disparity that are uniquely present in India. The\nresultant resource increases the number of stereotypes known for and in the\nIndian context by over 1000 stereotypes across many unique identities. We also\ndemonstrate the utility and effectiveness of such expanded resources for\nevaluations of language models. CONTENT WARNING: This paper contains examples\nof stereotypes that may be offensive.\n","authors":["Sunipa Dev","Jaya Goyal","Dinesh Tewari","Shachi Dave","Vinodkumar Prabhakaran"],"pdf_url":"https://arxiv.org/pdf/2307.10514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02288v3","updated":"2023-07-20T01:13:27Z","published":"2023-07-05T13:40:57Z","title":"Performance Comparison of Large Language Models on VNHSGE English\n Dataset: OpenAI ChatGPT, Microsoft Bing Chat, and Google Bard","summary":" This paper presents a performance comparison of three large language models\n(LLMs), namely OpenAI ChatGPT, Microsoft Bing Chat (BingChat), and Google Bard,\non the VNHSGE English dataset. The performance of BingChat, Bard, and ChatGPT\n(GPT-3.5) is 92.4\\%, 86\\%, and 79.2\\%, respectively. The results show that\nBingChat is better than ChatGPT and Bard. Therefore, BingChat and Bard can\nreplace ChatGPT while ChatGPT is not yet officially available in Vietnam. The\nresults also indicate that BingChat, Bard and ChatGPT outperform Vietnamese\nstudents in English language proficiency. The findings of this study contribute\nto the understanding of the potential of LLMs in English language education.\nThe remarkable performance of ChatGPT, BingChat, and Bard demonstrates their\npotential as effective tools for teaching and learning English at the high\nschool level.\n","authors":["Xuan-Quy Dao"],"pdf_url":"https://arxiv.org/pdf/2307.02288v3.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.10512v1","updated":"2023-07-20T01:11:14Z","published":"2023-07-20T01:11:14Z","title":"IvyGPT: InteractiVe Chinese pathwaY language model in medical domain","summary":" General large language models (LLMs) such as ChatGPT have shown remarkable\nsuccess. However, such LLMs have not been widely adopted for medical purposes,\ndue to poor accuracy and inability to provide medical advice. We propose\nIvyGPT, an LLM based on LLaMA that is trained and fine-tuned with high-quality\nmedical question-answer (QA) instances and Reinforcement Learning from Human\nFeedback (RLHF). After supervised fine-tuning, IvyGPT has good multi-turn\nconversation capabilities, but it cannot perform like a doctor in other\naspects, such as comprehensive diagnosis. Through RLHF, IvyGPT can output\nricher diagnosis and treatment answers that are closer to human. In the\ntraining, we used QLoRA to train 33 billion parameters on a small number of\nNVIDIA A100 (80GB) GPUs. 
Experimental results show that IvyGPT has outperformed\nother medical GPT models.\n","authors":["Rongsheng Wang","Yaofei Duan","ChanTong Lam","Jiexi Chen","Jiangsheng Xu","Haoming Chen","Xiaohong Liu","Patrick Cheong-Iao Pang","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2307.10512v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.11408v2","updated":"2023-07-20T00:58:30Z","published":"2023-05-19T03:31:42Z","title":"AlignAtt: Using Attention-based Audio-Translation Alignments as a Guide\n for Simultaneous Speech Translation","summary":" Attention is the core mechanism of today's most used architectures for\nnatural language processing and has been analyzed from many perspectives,\nincluding its effectiveness for machine translation-related tasks. Among these\nstudies, attention resulted to be a useful source of information to get\ninsights about word alignment also when the input text is substituted with\naudio segments, as in the case of the speech translation (ST) task. In this\npaper, we propose AlignAtt, a novel policy for simultaneous ST (SimulST) that\nexploits the attention information to generate source-target alignments that\nguide the model during inference. Through experiments on the 8 language pairs\nof MuST-C v1.0, we show that AlignAtt outperforms previous state-of-the-art\nSimulST policies applied to offline-trained models with gains in terms of BLEU\nof 2 points and latency reductions ranging from 0.5s to 0.8s across the 8\nlanguages.\n","authors":["Sara Papi","Marco Turchi","Matteo Negri"],"pdf_url":"https://arxiv.org/pdf/2305.11408v2.pdf","comment":"Accepted at Interspeech 2023"},{"id":"http://arxiv.org/abs/2307.09702v2","updated":"2023-07-20T00:40:41Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for Large Language Models","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10511v1","updated":"2023-07-20T00:36:41Z","published":"2023-07-20T00:36:41Z","title":"General Debiasing for Multimodal Sentiment Analysis","summary":" Existing work on Multimodal Sentiment Analysis (MSA) utilizes multimodal\ninformation for prediction yet unavoidably suffers from fitting the spurious\ncorrelations between multimodal features and sentiment labels. For example, if\nmost videos with a blue background have positive labels in a dataset, the model\nwill rely on such correlations for prediction, while ``blue background'' is not\na sentiment-related feature. To address this problem, we define a general\ndebiasing MSA task, which aims to enhance the Out-Of-Distribution (OOD)\ngeneralization ability of MSA models by reducing their reliance on spurious\ncorrelations. To this end, we propose a general debiasing framework based on\nInverse Probability Weighting (IPW), which adaptively assigns small weights to\nthe samples with larger bias i.e., the severer spurious correlations). 
The key\nto this debiasing framework is to estimate the bias of each sample, which is\nachieved by two steps: 1) disentangling the robust features and biased features\nin each modality, and 2) utilizing the biased features to estimate the bias.\nFinally, we employ IPW to reduce the effects of large-biased samples,\nfacilitating robust feature learning for sentiment prediction. To examine the\nmodel's generalization ability, we keep the original testing sets on two\nbenchmarks and additionally construct multiple unimodal and multimodal OOD\ntesting sets. The empirical results demonstrate the superior generalization\nability of our proposed framework. We have released the code and data to\nfacilitate the reproduction.\n","authors":["Teng Sun","Juntong Ni","Wenjie Wang","Liqiang Jing","Yinwei Wei","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2307.10511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11254v1","updated":"2023-07-20T22:10:04Z","published":"2023-07-20T22:10:04Z","title":"A Systematic Evaluation of Federated Learning on Biomedical Natural\n Language Processing","summary":" Language models (LMs) like BERT and GPT have revolutionized natural language\nprocessing (NLP). However, privacy-sensitive domains, particularly the medical\nfield, face challenges to train LMs due to limited data access and privacy\nconstraints imposed by regulations like the Health Insurance Portability and\nAccountability Act (HIPAA) and the General Data Protection Regulation (GDPR).\nFederated learning (FL) offers a decentralized solution that enables\ncollaborative learning while ensuring the preservation of data privacy. In this\nstudy, we systematically evaluate FL in medicine across $2$ biomedical NLP\ntasks using $6$ LMs encompassing $8$ corpora. Our results showed that: 1) FL\nmodels consistently outperform LMs trained on individual client's data and\nsometimes match the model trained with pooled data; 2) With a fixed amount of\ntotal data, LMs trained using FL with more clients exhibit inferior\nperformance, but pre-trained transformer-based models exhibited greater\nresilience. 3) LMs trained using FL perform nearly on par with the model\ntrained with pooled data when clients' data are IID distributed while\nexhibiting visible gaps with non-IID data. Our code is available at:\nhttps://github.com/PL97/FedNLP\n","authors":["Le Peng","sicheng zhou","jiandong chen","Rui Zhang","Ziyue Xu","Ju Sun"],"pdf_url":"https://arxiv.org/pdf/2307.11254v1.pdf","comment":"Accepted by KDD 2023 Workshop FL4Data-Mining"},{"id":"http://arxiv.org/abs/2307.11224v1","updated":"2023-07-20T20:37:24Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. While\nthese models are not exclusively designed for text generation, they excel in\napplications such as dense retrieval and semantic textual similarity. This\npaper details the development of Jina Embeddings, starting with the creation of\na high-quality pairwise and triplet dataset. 
It underlines the crucial role of\ndata cleaning in dataset preparation, gives in-depth insights into the model\ntraining process, and concludes with a comprehensive performance evaluation\nusing the Massive Textual Embedding Benchmark (MTEB).\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v1.pdf","comment":"9 pages, 2 page appendix, EMNLP 2023 Industrial Track"},{"id":"http://arxiv.org/abs/2307.09782v2","updated":"2023-07-20T18:47:20Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11170v1","updated":"2023-07-20T18:08:34Z","published":"2023-07-20T18:08:34Z","title":"UMLS-KGI-BERT: Data-Centric Knowledge Integration in Transformers for\n Biomedical Entity Recognition","summary":" Pre-trained transformer language models (LMs) have in recent years become the\ndominant paradigm in applied NLP. These models have achieved state-of-the-art\nperformance on tasks such as information extraction, question answering,\nsentiment analysis, document classification and many others. In the biomedical\ndomain, significant progress has been made in adapting this paradigm to NLP\ntasks that require the integration of domain-specific knowledge as well as\nstatistical modelling of language. In particular, research in this area has\nfocused on the question of how best to construct LMs that take into account not\nonly the patterns of token distribution in medical text, but also the wealth of\nstructured information contained in terminology resources such as the UMLS.\nThis work contributes a data-centric paradigm for enriching the language\nrepresentations of biomedical transformer-encoder LMs by extracting text\nsequences from the UMLS. 
This allows for graph-based learning objectives to be\ncombined with masked-language pre-training. Preliminary results from\nexperiments in the extension of pre-trained LMs as well as training from\nscratch show that this framework improves downstream performance on multiple\nbiomedical and clinical Named Entity Recognition (NER) tasks.\n","authors":["Aidan Mannion","Thierry Chevalier","Didier Schwab","Lorraine Geouriot"],"pdf_url":"https://arxiv.org/pdf/2307.11170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11585v2","updated":"2023-07-20T14:31:10Z","published":"2023-06-20T15:02:25Z","title":"FAIR: A Causal Framework for Accurately Inferring Judgments Reversals","summary":" Artificial intelligence researchers have made significant advances in legal\nintelligence in recent years. However, the existing studies have not focused on\nthe important value embedded in judgments reversals, which limits the\nimprovement of the efficiency of legal intelligence. In this paper, we propose\na causal Framework for Accurately Inferring case Reversals (FAIR), which models\nthe problem of judgments reversals based on real Chinese judgments. We mine the\ncauses of judgments reversals by causal inference methods and inject the\nobtained causal relationships into the neural network as a priori knowledge.\nAnd then, our framework is validated on a challenging dataset as a legal\njudgment prediction task. The experimental results show that our framework can\ntap the most critical factors in judgments reversal, and the obtained causal\nrelationships can effectively improve the neural network's performance. In\naddition, we discuss the generalization ability of large language models for\nlegal intelligence tasks using ChatGPT as an example. Our experiment has found\nthat the generalization ability of large language models still has defects, and\nmining causal relationships can effectively improve the accuracy and explain\nability of model predictions.\n","authors":["Minghua He","Nanfei Gu","Yuntao Shi","Qionghui Zhang","Yaying Chen"],"pdf_url":"https://arxiv.org/pdf/2306.11585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11788v1","updated":"2023-07-20T18:30:35Z","published":"2023-07-20T18:30:35Z","title":"Applying QNLP to sentiment analysis in finance","summary":" As an application domain where the slightest qualitative improvements can\nyield immense value, finance is a promising candidate for early quantum\nadvantage. Focusing on the rapidly advancing field of Quantum Natural Language\nProcessing (QNLP), we explore the practical applicability of the two central\napproaches DisCoCat and Quantum-Enhanced Long Short-Term Memory (QLSTM) to the\nproblem of sentiment analysis in finance. Utilizing a novel ChatGPT-based data\ngeneration approach, we conduct a case study with more than 1000 realistic\nsentences and find that QLSTMs can be trained substantially faster than\nDisCoCat while also achieving close to classical results for their available\nsoftware implementations.\n","authors":["Jonas Stein","Ivo Christ","Nicolas Kraus","Maximilian Balthasar Mansky","Robert Müller","Claudia Linnhof-Popien"],"pdf_url":"https://arxiv.org/pdf/2307.11788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11787v1","updated":"2023-07-20T16:22:36Z","published":"2023-07-20T16:22:36Z","title":"LLM Cognitive Judgements Differ From Human","summary":" Large Language Models (LLMs) have lately been on the spotlight of\nresearchers, businesses, and consumers alike. 
While the linguistic capabilities\nof such models have been studied extensively, there is growing interest in\ninvestigating them as cognitive subjects. In the present work, I examine GPT-3\nand ChatGPT capabilities on a limited-data inductive reasoning task from the\ncognitive science literature. The results suggest that these models' cognitive\njudgements are not human-like.\n","authors":["Sotiris Lamprinidis"],"pdf_url":"https://arxiv.org/pdf/2307.11787v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2307.11785v1","updated":"2023-07-20T12:44:47Z","published":"2023-07-20T12:44:47Z","title":"Adversarial Conversational Shaping for Intelligent Agents","summary":" The recent emergence of deep learning methods has enabled the research\ncommunity to achieve state-of-the-art results in several domains including\nnatural language processing. However, the current robocall system remains\nunstable and inaccurate: text generators and chat-bots can be tedious and\nmisunderstand human-like dialogue. In this work, we study the performance of\ntwo models able to enhance an intelligent conversational agent through\nadversarial conversational shaping: a generative adversarial network with\npolicy gradient (GANPG) and a generative adversarial network with reward for\nevery generation step (REGS) based on the REGS model presented in Li et al.\n[18]. This model is able to assign rewards to both partially and fully\ngenerated text sequences. We discuss performance with different training\ndetails: seq2seq [36] and transformers [37] in a reinforcement learning\nframework.\n","authors":["Piotr Tarasiewicz","Sultan Kenjeyev","Ilana Sebag","Shehab Alshehabi"],"pdf_url":"https://arxiv.org/pdf/2307.11785v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.11086v1","updated":"2023-07-20T17:59:33Z","published":"2023-07-20T17:59:33Z","title":"PAPR: Proximity Attention Point Rendering","summary":" Learning accurate and parsimonious point cloud representations of scene\nsurfaces from scratch remains a challenge in 3D representation learning.\nExisting point-based methods often suffer from the vanishing gradient problem\nor require a large number of points to accurately model scene geometry and\ntexture. To address these limitations, we propose Proximity Attention Point\nRendering (PAPR), a novel method that consists of a point-based scene\nrepresentation and a differentiable renderer. Our scene representation uses a\npoint cloud where each point is characterized by its spatial position,\nforeground score, and view-independent feature vector. The renderer selects the\nrelevant points for each ray and produces accurate colours using their\nassociated features. PAPR effectively learns point cloud positions to represent\nthe correct scene geometry, even when the initialization drastically differs\nfrom the target geometry. Notably, our method captures fine texture details\nwhile using only a parsimonious set of points. We also demonstrate four\npractical applications of our method: geometry editing, object manipulation,\ntexture transfer, and exposure control. 
More results and code are available on\nour project website at https://zvict.github.io/papr/.\n","authors":["Yanshu Zhang","Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2307.11086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07269v2","updated":"2023-07-20T17:59:25Z","published":"2023-07-14T10:50:43Z","title":"Frequency Domain Adversarial Training for Robust Volumetric Medical\n Segmentation","summary":" It is imperative to ensure the robustness of deep learning models in critical\napplications such as healthcare. While recent advances in deep learning have\nimproved the performance of volumetric medical image segmentation models, these\nmodels cannot be deployed for real-world applications immediately due to their\nvulnerability to adversarial attacks. We present a 3D frequency domain\nadversarial attack for volumetric medical image segmentation models and\ndemonstrate its advantages over conventional input or voxel domain attacks.\nUsing our proposed attack, we introduce a novel frequency domain adversarial\ntraining approach for optimizing a robust model against voxel and frequency\ndomain attacks. Moreover, we propose a frequency consistency loss to regulate our\nfrequency domain adversarial training that achieves a better tradeoff between the\nmodel's performance on clean and adversarial samples. Code is publicly\navailable at https://github.com/asif-hanif/vafa.\n","authors":["Asif Hanif","Muzammal Naseer","Salman Khan","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2307.07269v2.pdf","comment":"This paper has been accepted in MICCAI 2023 conference"},{"id":"http://arxiv.org/abs/2307.11085v1","updated":"2023-07-20T17:59:11Z","published":"2023-07-20T17:59:11Z","title":"Representation Learning in Anomaly Detection: Successes, Limits and a\n Grand Challenge","summary":" In this perspective paper, we argue that the dominant paradigm in anomaly\ndetection cannot scale indefinitely and will eventually hit fundamental limits.\nThis is due to a no free lunch principle for anomaly detection. These\nlimitations can be overcome when there are strong task priors, as is the case\nfor many industrial tasks. When such priors do not exist, the task is much\nharder for anomaly detection. We pose two such tasks as grand challenges for\nanomaly detection: i) scientific discovery by anomaly detection, and ii) a\n\"mini-grand\" challenge of detecting the most anomalous image in the ImageNet\ndataset. We believe new anomaly detection tools and ideas would need to be\ndeveloped to overcome these challenges.\n","authors":["Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2307.11085v1.pdf","comment":"Keynote talk at the Visual Anomaly and Novelty Detection Workshop,\n CVPR'23"},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer : Gated - Long, Short Sequence Transformer for Step\n Recognition in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken into account. 
In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11077v1","updated":"2023-07-20T17:55:14Z","published":"2023-07-20T17:55:14Z","title":"AlignDet: Aligning Pre-training and Fine-tuning in Object Detection","summary":" The paradigm of large-scale pre-training followed by downstream fine-tuning\nhas been widely employed in various object detection algorithms. In this paper,\nwe reveal discrepancies in data, model, and task between the pre-training and\nfine-tuning procedure in existing practices, which implicitly limit the\ndetector's performance, generalization ability, and convergence speed. To this\nend, we propose AlignDet, a unified pre-training framework that can be adapted\nto various existing detectors to alleviate the discrepancies. AlignDet\ndecouples the pre-training process into two stages, i.e., image-domain and\nbox-domain pre-training. The image-domain pre-training optimizes the detection\nbackbone to capture holistic visual abstraction, and box-domain pre-training\nlearns instance-level semantics and task-aware concepts to initialize the parts\nout of the backbone. By incorporating the self-supervised pre-trained\nbackbones, we can pre-train all modules for various detectors in an\nunsupervised paradigm. As depicted in Figure 1, extensive experiments\ndemonstrate that AlignDet can achieve significant improvements across diverse\nprotocols, such as detection algorithm, model backbone, data setting, and\ntraining schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by\n2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs.\n","authors":["Ming Li","Jie Wu","Xionghui Wang","Chen Chen","Jie Qin","Xuefeng Xiao","Rui Wang","Min Zheng","Xin Pan"],"pdf_url":"https://arxiv.org/pdf/2307.11077v1.pdf","comment":"Accepted by ICCV 2023. Code and Models are publicly available.\n Project Page: https://liming-ai.github.io/AlignDet"},{"id":"http://arxiv.org/abs/2307.11074v1","updated":"2023-07-20T17:53:57Z","published":"2023-07-20T17:53:57Z","title":"Learning Dense UV Completion for Human Mesh Recovery","summary":" Human mesh reconstruction from a single image is challenging in the presence\nof occlusion, which can be caused by self, objects, or other humans. Existing\nmethods either fail to separate human features accurately or lack proper\nsupervision for feature completion. In this paper, we propose Dense Inpainting\nHuman Mesh Recovery (DIMR), a two-stage method that leverages dense\ncorrespondence maps to handle occlusion. Our method utilizes a dense\ncorrespondence map to separate visible human features and completes human\nfeatures on a structured UV map dense human with an attention-based feature\ncompletion module. 
We also design a feature inpainting training procedure that\nguides the network to learn from unoccluded features. We evaluate our method on\nseveral datasets and demonstrate its superior performance under heavily\noccluded scenarios compared to other methods. Extensive experiments show that\nour method clearly outperforms prior SOTA methods on heavily occluded images\nand achieves comparable results on the standard benchmarks (3DPW).\n","authors":["Yanjun Wang","Qingping Sun","Wenjia Wang","Jun Ling","Zhongang Cai","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2307.11074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11073v1","updated":"2023-07-20T17:53:46Z","published":"2023-07-20T17:53:46Z","title":"OBJECT 3DIT: Language-guided 3D-aware Image Editing","summary":" Existing image editing tools, while powerful, typically disregard the\nunderlying 3D geometry from which the image is projected. As a result, edits\nmade using these tools may become detached from the geometry and lighting\nconditions that are at the foundation of the image formation process. In this\nwork, we formulate the new task of language-guided 3D-aware editing, where\nobjects in an image should be edited according to a language instruction in the\ncontext of the underlying 3D scene. To promote progress towards this goal, we\nrelease OBJECT: a dataset consisting of 400K editing examples created from\nprocedurally generated 3D scenes. Each example consists of an input image,\nediting instruction in language, and the edited image. We also introduce 3DIT:\nsingle and multi-task models for four editing tasks. Our models show impressive\nabilities to understand the 3D composition of entire scenes, factoring in\nsurrounding objects, surfaces, lighting conditions, shadows, and\nphysically-plausible object configurations. Surprisingly, despite training only on\nsynthetic scenes from OBJECT, the editing capabilities of 3DIT generalize to\nreal-world images.\n","authors":["Oscar Michel","Anand Bhattad","Eli VanderBilt","Ranjay Krishna","Aniruddha Kembhavi","Tanmay Gupta"],"pdf_url":"https://arxiv.org/pdf/2307.11073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01738v2","updated":"2023-07-20T17:53:41Z","published":"2023-07-04T14:14:12Z","title":"Mitigating Calibration Bias Without Fixed Attribute Grouping for\n Improved Fairness in Medical Imaging Analysis","summary":" Trustworthy deployment of deep learning medical imaging models into\nreal-world clinical practice requires that they be calibrated. However, models\nthat are well calibrated overall can still be poorly calibrated for a\nsub-population, potentially resulting in a clinician unwittingly making poor\ndecisions for this group based on the recommendations of the model. Although\nmethods have been shown to successfully mitigate biases across subgroups in\nterms of model accuracy, this work focuses on the open problem of mitigating\ncalibration biases in the context of medical image analysis. Our method does\nnot require subgroup attributes during training, permitting the flexibility to\nmitigate biases for different choices of sensitive attributes without\nre-training. To this end, we propose a novel two-stage method: Cluster-Focal to\nfirst identify poorly calibrated samples, cluster them into groups, and then\nintroduce group-wise focal loss to improve calibration bias. We evaluate our\nmethod on skin lesion classification with the public HAM10000 dataset, and on\npredicting future lesional activity for multiple sclerosis (MS) patients. 
In\naddition to considering traditional sensitive attributes (e.g. age, sex) with\ndemographic subgroups, we also consider biases among groups with different\nimage-derived attributes, such as lesion load, which are required in medical\nimage analysis. Our results demonstrate that our method effectively controls\ncalibration error in the worst-performing subgroups while preserving prediction\nperformance, and outperforming recent baselines.\n","authors":["Changjian Shui","Justin Szeto","Raghav Mehta","Douglas L. Arnold","Tal Arbel"],"pdf_url":"https://arxiv.org/pdf/2307.01738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11067v1","updated":"2023-07-20T17:46:21Z","published":"2023-07-20T17:46:21Z","title":"CNOS: A Strong Baseline for CAD-based Novel Object Segmentation","summary":" We propose a simple three-stage approach to segment unseen objects in RGB\nimages using their CAD models. Leveraging recent powerful foundation models,\nDINOv2 and Segment Anything, we create descriptors and generate proposals,\nincluding binary masks for a given input RGB image. By matching proposals with\nreference descriptors created from CAD models, we achieve precise object ID\nassignment along with modal masks. We experimentally demonstrate that our\nmethod achieves state-of-the-art results in CAD-based novel object\nsegmentation, surpassing existing approaches on the seven core datasets of the\nBOP challenge by 19.8\\% AP using the same BOP evaluation protocol. Our source\ncode is available at https://github.com/nv-nguyen/cnos.\n","authors":["Van Nguyen Nguyen","Tomas Hodan","Georgy Ponimatkin","Thibault Groueix","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2307.11067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11058v1","updated":"2023-07-20T17:38:55Z","published":"2023-07-20T17:38:55Z","title":"Driving Policy Prediction based on Deep Learning Models","summary":" In this project, we implemented an end-to-end system that takes in combined\nvisual features of video frames from a normal camera and depth information from\na point cloud scanner, and predicts driving policies (vehicle speed and\nsteering angle). We verified the safety of our system by comparing the\npredicted results with standard behaviors by real-world experienced drivers.\nOur test results show that the predictions can be considered accurate in at\nleast half of the testing cases (50%-80%, depending on the model), and using\ncombined features improved the performance in most cases compared to using video\nframes only.\n","authors":["Fuxiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.11058v1.pdf","comment":"5 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.11052v1","updated":"2023-07-20T17:33:57Z","published":"2023-07-20T17:33:57Z","title":"HRFNet: High-Resolution Forgery Network for Localizing Satellite Image\n Manipulation","summary":" Existing high-resolution satellite image forgery localization methods rely on\npatch-based or downsampling-based training. Both of these training methods have\nmajor drawbacks, such as inaccurate boundaries between pristine and forged\nregions, the generation of unwanted artifacts, etc. To tackle the\naforementioned challenges, inspired by the high-resolution image segmentation\nliterature, we propose a novel model called HRFNet to enable satellite image\nforgery localization effectively. Specifically, equipped with shallow and deep\nbranches, our model can successfully integrate RGB and resampling features in\nboth global and local manners to localize forgery more accurately. 
We perform\nvarious experiments to demonstrate that our method achieves the best\nperformance, while the memory requirement and processing speed are not\ncompromised compared to existing methods.\n","authors":["Fahim Faisal Niloy","Kishor Kumar Bhaumik","Simon S. Woo"],"pdf_url":"https://arxiv.org/pdf/2307.11052v1.pdf","comment":"ICIP 2023"},{"id":"http://arxiv.org/abs/2307.09023v3","updated":"2023-07-20T17:23:55Z","published":"2023-07-18T07:25:38Z","title":"LA-Net: Landmark-Aware Learning for Reliable Facial Expression\n Recognition under Label Noise","summary":" Facial expression recognition (FER) remains a challenging task due to the\nambiguity of expressions. The derived noisy labels significantly harm the\nperformance in real-world scenarios. To address this issue, we present a new\nFER model named Landmark-Aware Net~(LA-Net), which leverages facial landmarks\nto mitigate the impact of label noise from two perspectives. Firstly, LA-Net\nuses landmark information to suppress the uncertainty in expression space and\nconstructs the label distribution of each sample by neighborhood aggregation,\nwhich in turn improves the quality of training supervision. Secondly, the model\nincorporates landmark information into expression representations using the\ndevised expression-landmark contrastive loss. The enhanced expression feature\nextractor can be less susceptible to label noise. Our method can be integrated\nwith any deep neural network for better training supervision without\nintroducing extra inference costs. We conduct extensive experiments on both\nin-the-wild datasets and synthetic noisy datasets and demonstrate that LA-Net\nachieves state-of-the-art performance.\n","authors":["Zhiyu Wu","Jinshi Cui"],"pdf_url":"https://arxiv.org/pdf/2307.09023v3.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11035v1","updated":"2023-07-20T17:11:20Z","published":"2023-07-20T17:11:20Z","title":"Cascade-DETR: Delving into High-Quality Universal Object Detection","summary":" Object localization in general environments is a fundamental part of vision\nsystems. While dominating on the COCO benchmark, recent Transformer-based\ndetection methods are not competitive in diverse domains. Moreover, these\nmethods still struggle to very accurately estimate the object bounding boxes in\ncomplex environments.\n We introduce Cascade-DETR for high-quality universal object detection. We\njointly tackle the generalization to diverse domains and localization accuracy\nby proposing the Cascade Attention layer, which explicitly integrates\nobject-centric information into the detection decoder by limiting the attention\nto the previous box prediction. To further enhance accuracy, we also revisit\nthe scoring of queries. Instead of relying on classification scores, we predict\nthe expected IoU of the query, leading to substantially more well-calibrated\nconfidences. Lastly, we introduce a universal object detection benchmark,\nUDB10, that contains 10 datasets from diverse domains. While also advancing the\nstate-of-the-art on COCO, Cascade-DETR substantially improves DETR-based\ndetectors on all datasets in UDB10, even by over 10 mAP in some cases. The\nimprovements under stringent quality requirements are even more pronounced. Our\ncode and models will be released at https://github.com/SysCV/cascade-detr.\n","authors":["Mingqiao Ye","Lei Ke","Siyuan Li","Yu-Wing Tai","Chi-Keung Tang","Martin Danelljan","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2307.11035v1.pdf","comment":"Accepted in ICCV 2023. 
Our code and models will be released at\n https://github.com/SysCV/cascade-detr"},{"id":"http://arxiv.org/abs/2305.05610v2","updated":"2023-07-20T16:46:36Z","published":"2023-05-09T17:01:17Z","title":"Can point cloud networks learn statistical shape models of anatomies?","summary":" Statistical Shape Modeling (SSM) is a valuable tool for investigating and\nquantifying anatomical variations within populations of anatomies. However,\ntraditional correspondence-based SSM generation methods have a prohibitive\ninference process and require complete geometric proxies (e.g., high-resolution\nbinary volumes or surface meshes) as input shapes to construct the SSM.\nUnordered 3D point cloud representations of shapes are more easily acquired\nfrom various medical imaging practices (e.g., thresholded images and surface\nscanning). Point cloud deep networks have recently achieved remarkable success\nin learning permutation-invariant features for different point cloud tasks\n(e.g., completion, semantic segmentation, classification). However, their\napplication to learning SSM from point clouds is to-date unexplored. In this\nwork, we demonstrate that existing point cloud encoder-decoder-based completion\nnetworks can provide an untapped potential for SSM, capturing population-level\nstatistical representations of shapes while reducing the inference burden and\nrelaxing the input requirement. We discuss the limitations of these techniques\nto the SSM application and suggest future improvements. Our work paves the way\nfor further exploration of point cloud deep learning for SSM, a promising\navenue for advancing shape analysis literature and broadening SSM to diverse\nuse cases.\n","authors":["Jadie Adams","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2305.05610v2.pdf","comment":"Accepted to MICCAI 2023. 13 pages, 5 figures, appendix"},{"id":"http://arxiv.org/abs/2307.11017v1","updated":"2023-07-20T16:45:16Z","published":"2023-07-20T16:45:16Z","title":"Multi-objective point cloud autoencoders for explainable myocardial\n infarction prediction","summary":" Myocardial infarction (MI) is one of the most common causes of death in the\nworld. Image-based biomarkers commonly used in the clinic, such as ejection\nfraction, fail to capture more complex patterns in the heart's 3D anatomy and\nthus limit diagnostic accuracy. In this work, we present the multi-objective\npoint cloud autoencoder as a novel geometric deep learning approach for\nexplainable infarction prediction, based on multi-class 3D point cloud\nrepresentations of cardiac anatomy and function. Its architecture consists of\nmultiple task-specific branches connected by a low-dimensional latent space to\nallow for effective multi-objective learning of both reconstruction and MI\nprediction, while capturing pathology-specific 3D shape information in an\ninterpretable latent space. Furthermore, its hierarchical branch design with\npoint cloud-based deep learning operations enables efficient multi-scale\nfeature learning directly on high-resolution anatomy point clouds. In our\nexperiments on a large UK Biobank dataset, the multi-objective point cloud\nautoencoder is able to accurately reconstruct multi-temporal 3D shapes with\nChamfer distances between predicted and input anatomies below the underlying\nimages' pixel resolution. Our method outperforms multiple machine learning and\ndeep learning benchmarks for the task of incident MI prediction by 19% in terms\nof Area Under the Receiver Operating Characteristic curve. 
In addition, its\ntask-specific compact latent space exhibits easily separable control and MI\nclusters with clinically plausible associations between subject encodings and\ncorresponding 3D shapes, thus demonstrating the explainability of the\nprediction.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.11017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05797v2","updated":"2023-07-20T16:36:32Z","published":"2023-05-09T23:01:05Z","title":"Fully Bayesian VIB-DeepSSM","summary":" Statistical shape modeling (SSM) enables population-based quantitative\nanalysis of anatomical shapes, informing clinical diagnosis. Deep learning\napproaches predict correspondence-based SSM directly from unsegmented 3D images\nbut require calibrated uncertainty quantification, motivating Bayesian\nformulations. Variational information bottleneck DeepSSM (VIB-DeepSSM) is an\neffective, principled framework for predicting probabilistic shapes of anatomy\nfrom images with aleatoric uncertainty quantification. However, VIB is only\nhalf-Bayesian and lacks epistemic uncertainty inference. We derive a fully\nBayesian VIB formulation and demonstrate the efficacy of two scalable\nimplementation approaches: concrete dropout and batch ensemble. Additionally,\nwe introduce a novel combination of the two that further enhances uncertainty\ncalibration via multimodal marginalization. Experiments on synthetic shapes and\nleft atrium data demonstrate that the fully Bayesian VIB network predicts SSM\nfrom images with improved uncertainty reasoning without sacrificing accuracy.\n","authors":["Jadie Adams","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2305.05797v2.pdf","comment":"Accepted to MICCAI 2023. 13 pages, 4 figures, appendix"},{"id":"http://arxiv.org/abs/2210.05335v3","updated":"2023-07-20T16:24:14Z","published":"2022-10-11T10:54:54Z","title":"MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model","summary":" Multimodal semantic understanding often has to deal with uncertainty, which\nmeans the obtained messages tend to refer to multiple targets. Such uncertainty\nis problematic for our interpretation, including inter- and intra-modal\nuncertainty. Little effort has studied the modeling of this uncertainty,\nparticularly in pre-training on unlabeled datasets and fine-tuning in\ntask-specific downstream datasets. In this paper, we project the\nrepresentations of all modalities as probabilistic distributions via a\nProbability Distribution Encoder (PDE) by utilizing sequence-level\ninteractions. Compared to the existing deterministic methods, such uncertainty\nmodeling can convey richer multimodal semantic information and more complex\nrelationships. Furthermore, we integrate uncertainty modeling with popular\npre-training frameworks and propose suitable pre-training tasks:\nDistribution-based Vision-Language Contrastive learning (D-VLC),\nDistribution-based Masked Language Modeling (D-MLM), and Distribution-based\nImage-Text Matching (D-ITM). 
The fine-tuned models are applied to challenging\ndownstream tasks, including image-text retrieval, visual question answering,\nvisual reasoning, and visual entailment, and achieve state-of-the-art results.\n","authors":["Yatai Ji","Junjie Wang","Yuan Gong","Lin Zhang","Yanru Zhu","Hongfa Wang","Jiaxing Zhang","Tetsuya Sakai","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05335v3.pdf","comment":"CVPR 2023 Main Track Long Paper"},{"id":"http://arxiv.org/abs/2307.10984v1","updated":"2023-07-20T16:14:23Z","published":"2023-07-20T16:14:23Z","title":"Metric3D: Towards Zero-shot Metric 3D Prediction from A Single Image","summary":" Reconstructing accurate 3D scenes from images is a long-standing vision task.\nDue to the ill-posedness of the single-image reconstruction problem, most\nwell-established methods are built upon multi-view geometry. State-of-the-art\n(SOTA) monocular metric depth estimation methods can only handle a single\ncamera model and are unable to perform mixed-data training due to the metric\nambiguity. Meanwhile, SOTA monocular methods trained on large mixed datasets\nachieve zero-shot generalization by learning affine-invariant depths, which\ncannot recover real-world metrics. In this work, we show that the key to a\nzero-shot single-view metric depth model lies in the combination of large-scale\ndata training and resolving the metric ambiguity from various camera models. We\npropose a canonical camera space transformation module, which explicitly\naddresses the ambiguity problems and can be effortlessly plugged into existing\nmonocular models. Equipped with our module, monocular models can be stably\ntrained with over 8 million images with thousands of camera models, resulting\nin zero-shot generalization to in-the-wild images with unseen camera settings.\nExperiments demonstrate SOTA performance of our method on 7 zero-shot\nbenchmarks. Notably, our method won the championship in the 2nd Monocular Depth\nEstimation Challenge. Our method enables the accurate recovery of metric 3D\nstructures on randomly collected internet images, paving the way for plausible\nsingle-image metrology. The potential benefits extend to downstream tasks,\nwhich can be significantly improved by simply plugging in our model. For\nexample, our model relieves the scale drift issues of monocular-SLAM (Fig. 1),\nleading to high-quality metric scale dense mapping. The code is available at\nhttps://github.com/YvanYin/Metric3D.\n","authors":["Wei Yin","Chi Zhang","Hao Chen","Zhipeng Cai","Gang Yu","Kaixuan Wang","Xiaozhi Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2307.10984v1.pdf","comment":"Accepted to ICCV 2023. Won the championship in the 2nd Monocular\n Depth Estimation Challenge. The code is available at\n https://github.com/YvanYin/Metric3D"},{"id":"http://arxiv.org/abs/2307.09676v2","updated":"2023-07-20T16:04:11Z","published":"2023-07-18T23:06:47Z","title":"Domain Adaptation based Enhanced Detection for Autonomous Driving in\n Foggy and Rainy Weather","summary":" Typically, object detection methods for autonomous driving that rely on\nsupervised learning make the assumption of a consistent feature distribution\nbetween the training and testing data, however such assumption may fail in\ndifferent weather conditions. Due to the domain gap, a detection model trained\nunder clear weather may not perform well in foggy and rainy conditions.\nOvercoming detection bottlenecks in foggy and rainy weather is a real challenge\nfor autonomous vehicles deployed in the wild. 
To bridge the domain gap and\nimprove the performance of object detection in foggy and rainy weather, this\npaper presents a novel framework for domain-adaptive object detection. The\nadaptations at both the image-level and object-level are intended to minimize\nthe differences in image style and object appearance between domains.\nFurthermore, in order to improve the model's performance on challenging\nexamples, we introduce a novel adversarial gradient reversal layer that\nconducts adversarial mining on difficult instances in addition to domain\nadaptation. Additionally, we suggest generating an auxiliary domain through\ndata augmentation to enforce a new domain-level metric regularization.\nExperimental findings on a public V2V benchmark exhibit a substantial enhancement\nin object detection specifically for foggy and rainy driving scenarios.\n","authors":["Jinlong Li","Runsheng Xu","Jin Ma","Qin Zou","Jiaqi Ma","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2307.09676v2.pdf","comment":"only change the title of this paper"},{"id":"http://arxiv.org/abs/2307.10974v1","updated":"2023-07-20T16:00:19Z","published":"2023-07-20T16:00:19Z","title":"Deep Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverages pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. 
The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v1.pdf","comment":"22 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.10955v1","updated":"2023-07-20T15:26:57Z","published":"2023-07-20T15:26:57Z","title":"Spinal nerve segmentation method and dataset construction in endoscopic\n surgical scenarios","summary":" Endoscopic surgery is currently an important treatment method in the field of\nspinal surgery, and avoiding damage to the spinal nerves through video guidance\nis a key challenge. This paper presents the first real-time segmentation method\nfor spinal nerves in endoscopic surgery, which provides crucial navigational\ninformation for surgeons. A finely annotated segmentation dataset of\napproximately 10,000 consecutive frames recorded during surgery is constructed\nfor the first time for this field, addressing the problem of semantic\nsegmentation. Based on this dataset, we propose FUnet (Frame-Unet), which\nachieves state-of-the-art performance by utilizing inter-frame information and\nself-attention mechanisms. We also conduct extended experiments on a similar\npolyp endoscopy video dataset and show that the model has good generalization\nability with advantageous performance. The dataset and code of this work are\npresented at: https://github.com/zzzzzzpc/FUnet.\n","authors":["Shaowu Peng","Pengcheng Zhao","Yongyu Ye","Junying Chen","Yunbing Chang","Xiaoqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.10955v1.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.10954v1","updated":"2023-07-20T15:26:01Z","published":"2023-07-20T15:26:01Z","title":"Soft-tissue Driven Craniomaxillofacial Surgical Planning","summary":" In CMF surgery, the planning of bony movement to achieve a desired facial\noutcome is a challenging task. Current bone driven approaches focus on\nnormalizing the bone with the expectation that the facial appearance will be\ncorrected accordingly. However, due to the complex non-linear relationship\nbetween bony structure and facial soft-tissue, such bone-driven methods are\ninsufficient to correct facial deformities. Despite efforts to simulate facial\nchanges resulting from bony movement, surgical planning still relies on\niterative revisions and educated guesses. To address these issues, we propose a\nsoft-tissue driven framework that can automatically create and verify surgical\nplans. Our framework consists of a bony planner network that estimates the bony\nmovements required to achieve the desired facial outcome and a facial simulator\nnetwork that can simulate the possible facial changes resulting from the\nestimated bony movement plans. By combining these two models, we can verify and\ndetermine the final bony movement required for planning. The proposed framework\nwas evaluated using a clinical dataset, and our experimental results\ndemonstrate that the soft-tissue driven approach greatly improves the accuracy\nand efficacy of surgical planning when compared to the conventional bone-driven\napproach.\n","authors":["Xi Fang","Daeseung Kim","Xuanang Xu","Tianshu Kuang","Nathan Lampen","Jungwook Lee","Hannah H. Deng","Jaime Gateno","Michael A. K. Liebschner","James J. 
Xia","Pingkun Yan"],"pdf_url":"https://arxiv.org/pdf/2307.10954v1.pdf","comment":"Early accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.10953v1","updated":"2023-07-20T15:25:55Z","published":"2023-07-20T15:25:55Z","title":"PE-YOLO: Pyramid Enhancement Network for Dark Object Detection","summary":" Current object detection models have achieved good results on many benchmark\ndatasets, detecting objects in dark conditions remains a large challenge. To\naddress this issue, we propose a pyramid enhanced network (PENet) and joint it\nwith YOLOv3 to build a dark object detection framework named PE-YOLO. Firstly,\nPENet decomposes the image into four components of different resolutions using\nthe Laplacian pyramid. Specifically we propose a detail processing module (DPM)\nto enhance the detail of images, which consists of context branch and edge\nbranch. In addition, we propose a low-frequency enhancement filter (LEF) to\ncapture low-frequency semantics and prevent high-frequency noise. PE-YOLO\nadopts an end-to-end joint training approach and only uses normal detection\nloss to simplify the training process. We conduct experiments on the low-light\nobject detection dataset ExDark to demonstrate the effectiveness of ours. The\nresults indicate that compared with other dark detectors and low-light\nenhancement models, PE-YOLO achieves the advanced results, achieving 78.0% in\nmAP and 53.6 in FPS, respectively, which can adapt to object detection under\ndifferent low-light conditions. The code is available at\nhttps://github.com/XiangchenYin/PE-YOLO.\n","authors":["Xiangchen Yin","Zhenda Yu","Zetao Fei","Wenjun Lv","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2307.10953v1.pdf","comment":"Accepted at ICANN 2023"},{"id":"http://arxiv.org/abs/2307.10947v1","updated":"2023-07-20T15:21:28Z","published":"2023-07-20T15:21:28Z","title":"Improving Online Lane Graph Extraction by Object-Lane Clustering","summary":" Autonomous driving requires accurate local scene understanding information.\nTo this end, autonomous agents deploy object detection and online BEV lane\ngraph extraction methods as a part of their perception stack. In this work, we\npropose an architecture and loss formulation to improve the accuracy of local\nlane graph estimates by using 3D object detection outputs. The proposed method\nlearns to assign the objects to centerlines by considering the centerlines as\ncluster centers and the objects as data points to be assigned a probability\ndistribution over the cluster centers. This training scheme ensures direct\nsupervision on the relationship between lanes and objects, thus leading to\nbetter performance. The proposed method improves lane graph estimation\nsubstantially over state-of-the-art methods. The extensive ablations show that\nour method can achieve significant performance improvements by using the\noutputs of existing 3D object detection methods. 
Since our method uses the\ndetection outputs rather than detection method intermediate representations, a\nsingle model of our method can use any detection method at test time.\n","authors":["Yigit Baran Can","Alexander Liniger","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2307.10947v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10943v1","updated":"2023-07-20T15:13:29Z","published":"2023-07-20T15:13:29Z","title":"Proxy Anchor-based Unsupervised Learning for Continuous Generalized\n Category Discovery","summary":" Recent advances in deep learning have significantly improved the performance\nof various computer vision applications. However, discovering novel categories\nin an incremental learning scenario remains a challenging problem due to the\nlack of prior knowledge about the number and nature of new categories. Existing\nmethods for novel category discovery are limited by their reliance on labeled\ndatasets and prior knowledge about the number of novel categories and the\nproportion of novel samples in the batch. To address the limitations and more\naccurately reflect real-world scenarios, in this paper, we propose a novel\nunsupervised class incremental learning approach for discovering novel\ncategories on unlabeled sets without prior knowledge. The proposed method\nfine-tunes the feature extractor and proxy anchors on labeled sets, then splits\nsamples into old and novel categories and clusters on the unlabeled dataset.\nFurthermore, the proxy anchors-based exemplar generates representative category\nvectors to mitigate catastrophic forgetting. Experimental results demonstrate\nthat our proposed approach outperforms the state-of-the-art methods on\nfine-grained datasets under real-world scenarios.\n","authors":["Hyungmin Kim","Sungho Suh","Daehwan Kim","Daun Jeong","Hansang Cho","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.10943v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2301.06262v2","updated":"2023-07-20T15:09:50Z","published":"2023-01-16T05:08:50Z","title":"Collaborative Perception in Autonomous Driving: Methods, Datasets and\n Challenges","summary":" Collaborative perception is essential to address occlusion and sensor failure\nissues in autonomous driving. In recent years, theoretical and experimental\ninvestigations of novel works for collaborative perception have increased\ntremendously. So far, however, few reviews have focused on systematical\ncollaboration modules and large-scale collaborative perception datasets. This\nwork reviews recent achievements in this field to bridge this gap and motivate\nfuture research. We start with a brief overview of collaboration schemes. After\nthat, we systematically summarize the collaborative perception methods for\nideal scenarios and real-world issues. The former focus on collaboration\nmodules and efficiency, and the latter is devoted to addressing the problems in\nactual application. Furthermore, we present large-scale public datasets and\nsummarize quantitative results on these benchmarks. Finally, we highlight gaps\nand overlooked challenges between current academic research and real-world\napplications.\n","authors":["Yushan Han","Hui Zhang","Huifang Li","Yi Jin","Congyan Lang","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2301.06262v2.pdf","comment":"18 pages, 6 figures. Accepted by IEEE Intelligent Transportation\n Systems Magazine. 
URL:\n https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving"},{"id":"http://arxiv.org/abs/2307.10934v1","updated":"2023-07-20T15:06:44Z","published":"2023-07-20T15:06:44Z","title":"OCTraN: 3D Occupancy Convolutional Transformer Network in Unstructured\n Traffic Scenarios","summary":" Modern approaches for vision-centric environment perception for autonomous\nnavigation make extensive use of self-supervised monocular depth estimation\nalgorithms that output disparity maps. However, when this disparity map is\nprojected onto 3D space, the errors in disparity are magnified, resulting in a\ndepth estimation error that increases quadratically as the distance from the\ncamera increases. Though Light Detection and Ranging (LiDAR) can solve this\nissue, it is expensive and not feasible for many applications. To address the\nchallenge of accurate ranging with low-cost sensors, we propose, OCTraN, a\ntransformer architecture that uses iterative-attention to convert 2D image\nfeatures into 3D occupancy features and makes use of convolution and transpose\nconvolution to efficiently operate on spatial information. We also develop a\nself-supervised training pipeline to generalize the model to any scene by\neliminating the need for LiDAR ground truth by substituting it with\npseudo-ground truth labels obtained from boosted monocular depth estimation.\n","authors":["Aditya Nalgunda Ganesh","Dhruval Pobbathi Badrinath","Harshith Mohan Kumar","Priya SS","Surabhi Narayan"],"pdf_url":"https://arxiv.org/pdf/2307.10934v1.pdf","comment":"This work was accepted as a spotlight presentation at the\n Transformers for Vision Workshop @CVPR 2023"},{"id":"http://arxiv.org/abs/2307.10927v1","updated":"2023-07-20T14:56:29Z","published":"2023-07-20T14:56:29Z","title":"Modeling 3D cardiac contraction and relaxation with point cloud\n deformation networks","summary":" Global single-valued biomarkers of cardiac function typically used in\nclinical practice, such as ejection fraction, provide limited insight on the\ntrue 3D cardiac deformation process and hence, limit the understanding of both\nhealthy and pathological cardiac mechanics. In this work, we propose the Point\nCloud Deformation Network (PCD-Net) as a novel geometric deep learning approach\nto model 3D cardiac contraction and relaxation between the extreme ends of the\ncardiac cycle. It employs the recent advances in point cloud-based deep\nlearning into an encoder-decoder structure, in order to enable efficient\nmulti-scale feature learning directly on multi-class 3D point cloud\nrepresentations of the cardiac anatomy. We evaluate our approach on a large\ndataset of over 10,000 cases from the UK Biobank study and find average Chamfer\ndistances between the predicted and ground truth anatomies below the pixel\nresolution of the underlying image acquisition. Furthermore, we observe similar\nclinical metrics between predicted and ground truth populations and show that\nthe PCD-Net can successfully capture subpopulation-specific differences between\nnormal subjects and myocardial infarction (MI) patients. 
We then demonstrate\nthat the learned 3D deformation patterns outperform multiple clinical\nbenchmarks by 13% and 7% in terms of area under the receiver operating\ncharacteristic curve for the tasks of prevalent MI detection and incident MI\nprediction and by 7% in terms of Harrell's concordance index for MI survival\nanalysis.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.10927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10926v1","updated":"2023-07-20T14:52:45Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard-deviation across of the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry experiments on 3D\nimage segmentation using the standard nnU-net framework, two datasets from the\nMedical Decathlon challenge and two performance measures: the Dice accuracy and\nthe Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard-deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquax","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.10924v1","updated":"2023-07-20T14:51:28Z","published":"2023-07-20T14:51:28Z","title":"Intrinsic Appearance Decomposition Using Point Cloud Representation","summary":" Intrinsic decomposition is to infer the albedo and shading from the image.\nSince it is a heavily ill-posed problem, previous methods rely on prior\nassumptions from 2D images, however, the exploration of the data representation\nitself is limited. The point cloud is known as a rich format of scene\nrepresentation, which naturally aligns the geometric information and the color\ninformation of an image. Our proposed method, Point Intrinsic Net, in short,\nPoInt-Net, jointly predicts the albedo, light source direction, and shading,\nusing point cloud representation. 
Experiments reveal the benefits of PoInt-Net,\nin terms of accuracy, it outperforms 2D representation approaches on multiple\nmetrics across datasets; in terms of efficiency, it trains on small-scale point\nclouds and performs stably on any-scale point clouds; in terms of robustness,\nit only trains on single object level dataset, and demonstrates reasonable\ngeneralization ability for unseen objects and scenes.\n","authors":["Xiaoyan Xing","Konrad Groh","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2307.10924v1.pdf","comment":"14 pages, 14 figures"},{"id":"http://arxiv.org/abs/2307.10922v1","updated":"2023-07-20T14:47:50Z","published":"2023-07-20T14:47:50Z","title":"Language-based Action Concept Spaces Improve Video Self-Supervised\n Learning","summary":" Recent contrastive language image pre-training has led to learning highly\ntransferable and robust image representations. However, adapting these models\nto video domains with minimal supervision remains an open problem. We explore a\nsimple step in that direction, using language tied self-supervised learning to\nadapt an image CLIP model to the video domain. A backbone modified for temporal\nmodeling is trained under self-distillation settings with train objectives\noperating in an action concept space. Feature vectors of various action\nconcepts extracted from a language encoder using relevant textual prompts\nconstruct this space. We introduce two train objectives, concept distillation\nand concept alignment, that retain generality of original representations while\nenforcing relations between actions and their attributes. Our approach improves\nzero-shot and linear probing performance on three action recognition\nbenchmarks.\n","authors":["Kanchana Ranasinghe","Michael Ryoo"],"pdf_url":"https://arxiv.org/pdf/2307.10922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10915v1","updated":"2023-07-20T14:39:46Z","published":"2023-07-20T14:39:46Z","title":"Revisiting Fine-Tuning Strategies for Self-supervised Medical Imaging\n Analysis","summary":" Despite the rapid progress in self-supervised learning (SSL), end-to-end\nfine-tuning still remains the dominant fine-tuning strategy for medical imaging\nanalysis. However, it remains unclear whether this approach is truly optimal\nfor effectively utilizing the pre-trained knowledge, especially considering the\ndiverse categories of SSL that capture different types of features. In this\npaper, we first establish strong contrastive and restorative SSL baselines that\noutperform SOTA methods across four diverse downstream tasks. Building upon\nthese strong baselines, we conduct an extensive fine-tuning analysis across\nmultiple pre-training and fine-tuning datasets, as well as various fine-tuning\ndataset sizes. Contrary to the conventional wisdom of fine-tuning only the last\nfew layers of a pre-trained network, we show that fine-tuning intermediate\nlayers is more effective, with fine-tuning the second quarter (25-50%) of the\nnetwork being optimal for contrastive SSL whereas fine-tuning the third quarter\n(50-75%) of the network being optimal for restorative SSL. 
Compared to the\nde-facto standard of end-to-end fine-tuning, our best fine-tuning strategy,\nwhich fine-tunes a shallower network consisting of the first three quarters\n(0-75%) of the pre-trained network, yields improvements of as much as 5.48%.\nAdditionally, using these insights, we propose a simple yet effective method to\nleverage the complementary strengths of multiple SSL models, resulting in\nenhancements of up to 3.57% compared to using the best model alone. Hence, our\nfine-tuning strategies not only enhance the performance of individual SSL\nmodels, but also enable effective utilization of the complementary strengths\noffered by multiple SSL models, leading to significant improvements in\nself-supervised medical imaging analysis.\n","authors":["Muhammad Osama Khan","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.10915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10912v1","updated":"2023-07-20T14:34:08Z","published":"2023-07-20T14:34:08Z","title":"WeakPolyp: You Only Look Bounding Box for Polyp Segmentation","summary":" Limited by expensive pixel-level labels, polyp segmentation models are\nplagued by data shortage and suffer from impaired generalization. In contrast,\npolyp bounding box annotations are much cheaper and more accessible. Thus, to\nreduce labeling cost, we propose to learn a weakly supervised polyp\nsegmentation model (i.e., WeakPolyp) completely based on bounding box\nannotations. However, coarse bounding boxes contain too much noise. To avoid\ninterference, we introduce the mask-to-box (M2B) transformation. By supervising\nthe outer box mask of the prediction instead of the prediction itself, M2B\ngreatly mitigates the mismatch between the coarse label and the precise\nprediction. But, M2B only provides sparse supervision, leading to non-unique\npredictions. Therefore, we further propose a scale consistency (SC) loss for\ndense supervision. By explicitly aligning predictions across the same image at\ndifferent scales, the SC loss largely reduces the variation of predictions.\nNote that our WeakPolyp is a plug-and-play model, which can be easily ported to\nother appealing backbones. Besides, the proposed modules are only used during\ntraining, bringing no computation cost to inference. Extensive experiments\ndemonstrate the effectiveness of our proposed WeakPolyp, which surprisingly\nachieves a comparable performance with a fully supervised model, requiring no\nmask annotations at all.\n","authors":["Jun Wei","Yiwen Hu","Shuguang Cui","S. Kevin Zhou","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2307.10912v1.pdf","comment":"accepted by MICCAI 2023, codes are available at\n https://github.com/weijun88/WeakPolyp"},{"id":"http://arxiv.org/abs/2306.14687v2","updated":"2023-07-20T14:29:39Z","published":"2023-06-26T13:32:09Z","title":"GSMorph: Gradient Surgery for cine-MRI Cardiac Deformable Registration","summary":" Deep learning-based deformable registration methods have been widely\ninvestigated in diverse medical applications. Learning-based deformable\nregistration relies on weighted objective functions trading off registration\naccuracy and smoothness of the deformation field. Therefore, they inevitably\nrequire tuning the hyperparameter for optimal registration performance. Tuning\nthe hyperparameters is highly computationally expensive and introduces\nundesired dependencies on domain knowledge. 
In this study, we construct a\nregistration model based on the gradient surgery mechanism, named GSMorph, to\nachieve a hyperparameter-free balance on multiple losses. In GSMorph, we\nreformulate the optimization procedure by projecting the gradient of similarity\nloss orthogonally to the plane associated with the smoothness constraint,\nrather than additionally introducing a hyperparameter to balance these two\ncompeting terms. Furthermore, our method is model-agnostic and can be merged\ninto any deep registration network without introducing extra parameters or\nslowing down inference. In this study, We compared our method with\nstate-of-the-art (SOTA) deformable registration approaches over two publicly\navailable cardiac MRI datasets. GSMorph proves superior to five SOTA\nlearning-based registration models and two conventional registration\ntechniques, SyN and Demons, on both registration accuracy and smoothness.\n","authors":["Haoran Dou","Ning Bi","Luyi Han","Yuhao Huang","Ritse Mann","Xin Yang","Dong Ni","Nishant Ravikumar","Alejandro F. Frangi","Yunzhi Huang"],"pdf_url":"https://arxiv.org/pdf/2306.14687v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2110.05216v2","updated":"2023-07-20T14:29:07Z","published":"2021-10-11T12:32:56Z","title":"High-order Tensor Pooling with Attention for Action Recognition","summary":" We aim at capturing high-order statistics of feature vectors formed by a\nneural network, and propose end-to-end second- and higher-order pooling to form\na tensor descriptor. Tensor descriptors require a robust similarity measure due\nto low numbers of aggregated vectors and the burstiness phenomenon, when a\ngiven feature appears more/less frequently than statistically expected. The\nHeat Diffusion Process (HDP) on a graph Laplacian is closely related to the\nEigenvalue Power Normalization (EPN) of the covariance/auto-correlation matrix,\nwhose inverse forms a loopy graph Laplacian. We show that the HDP and the EPN\nplay the same role, i.e., to boost or dampen the magnitude of the eigenspectrum\nthus preventing the burstiness. We equip higher-order tensors with EPN which\nacts as a spectral detector of higher-order occurrences to prevent burstiness.\nWe also prove that for a tensor of order r built from d dimensional feature\ndescriptors, such a detector gives the likelihood if at least one higher-order\noccurrence is 'projected' into one of binom(d,r) subspaces represented by the\ntensor; thus forming a tensor power normalization metric endowed with\nbinom(d,r) such 'detectors'. For experimental contributions, we apply several\nsecond- and higher-order pooling variants to action recognition, provide\npreviously not presented comparisons of such pooling variants, and show\nstate-of-the-art results on HMDB-51, YUP++ and MPII Cooking Activities.\n","authors":["Piotr Koniusz","Lei Wang","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2110.05216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10895v1","updated":"2023-07-20T14:18:44Z","published":"2023-07-20T14:18:44Z","title":"Variational Point Encoding Deformation for Dental Modeling","summary":" Digital dentistry has made significant advancements in recent years, yet\nnumerous challenges remain to be addressed. In this study, we release a new\nextensive dataset of tooth meshes to encourage further research. Additionally,\nwe propose Variational FoldingNet (VF-Net), which extends FoldingNet to enable\nprobabilistic learning of point cloud representations. 
A key challenge in\nexisting latent variable models for point clouds is the lack of a 1-to-1\nmapping between input points and output points. Instead, they must rely on\noptimizing Chamfer distances, a metric that does not have a normalized\ndistributional counterpart, preventing its usage in probabilistic models. We\ndemonstrate that explicit minimization of Chamfer distances can be replaced by\na suitable encoder, which allows us to increase computational efficiency while\nsimplifying the probabilistic extension. Our experimental findings present\nempirical evidence demonstrating the superior performance of VF-Net over\nexisting models in terms of dental scan reconstruction and extrapolation.\nAdditionally, our investigation highlights the robustness of VF-Net's latent\nrepresentations. These results underscore the promising prospects of VF-Net as\nan effective and reliable method for point cloud reconstruction and analysis.\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10894v1","updated":"2023-07-20T14:15:20Z","published":"2023-07-20T14:15:20Z","title":"Human Motion Generation: A Survey","summary":" Human motion generation aims to generate natural human pose sequences and\nshows immense potential for real-world applications. Substantial progress has\nbeen made recently in motion data collection technologies and generation\nmethods, laying the foundation for increasing interest in human motion\ngeneration. Most research within this field focuses on generating human motions\nbased on conditional signals, such as text, audio, and scene contexts. While\nsignificant advancements have been made in recent years, the task continues to\npose challenges due to the intricate nature of human motion and its implicit\nrelationship with conditional signals. In this survey, we present a\ncomprehensive literature review of human motion generation, which, to the best\nof our knowledge, is the first of its kind in this field. We begin by\nintroducing the background of human motion and generative models, followed by\nan examination of representative methods for three mainstream sub-tasks:\ntext-conditioned, audio-conditioned, and scene-conditioned human motion\ngeneration. Additionally, we provide an overview of common datasets and\nevaluation metrics. Lastly, we discuss open problems and outline potential\nfuture research directions. We hope that this survey could provide the\ncommunity with a comprehensive glimpse of this rapidly evolving field and\ninspire novel ideas that address the outstanding challenges.\n","authors":["Wentao Zhu","Xiaoxuan Ma","Dongwoo Ro","Hai Ci","Jinlu Zhang","Jiaxin Shi","Feng Gao","Qi Tian","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10894v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.10875v1","updated":"2023-07-20T13:47:30Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" The popularity of point cloud deep models for safety-critical purposes has\nincreased, but the reliability and security of these models can be compromised\nby intentional or naturally occurring point cloud noise. To combat this issue,\nwe present a novel point cloud outlier removal method called PointCVaR, which\nempowers standard-trained models to eliminate additional outliers and restore\nthe data. 
Our approach begins by conducting attribution analysis to determine\nthe influence of each point on the model output, which we refer to as point\nrisk. We then optimize the process of filtering high-risk points using\nConditional Value at Risk (CVaR) as the objective. The rationale for this\napproach is based on the observation that noise points in point clouds tend to\ncluster in the tail of the risk distribution, with a low frequency but a high\nlevel of risk, resulting in significant interference with classification\nresults. Despite requiring no additional training effort, our method produces\nexceptional results in various removal-and-classification experiments for noisy\npoint clouds, which are corrupted by random noise, adversarial noise, and\nbackdoor trigger noise. Impressively, it achieves 87% accuracy in defense\nagainst the backdoor attack by removing triggers. Overall, the proposed\nPointCVaR effectively eliminates noise points and enhances point cloud\nclassification, making it a promising plug-in module for various models in\ndifferent scenarios.\n","authors":["Xinke Li","Junchi Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10873v1","updated":"2023-07-20T13:43:48Z","published":"2023-07-20T13:43:48Z","title":"Conservative Estimation of Perception Relevance of Dynamic Objects for\n Safe Trajectories in Automotive Scenarios","summary":" Having efficient testing strategies is a core challenge that needs to be\novercome for the release of automated driving. This necessitates clear\nrequirements as well as suitable methods for testing. In this work, the\nrequirements for perception modules are considered with respect to relevance.\nThe concept of relevance currently remains insufficiently defined and\nspecified. In this paper, we propose a novel methodology to overcome this\nchallenge by exemplary application to collision safety in the highway domain.\nUsing this general system and use case specification, a corresponding concept\nfor relevance is derived. Irrelevant objects are thus defined as objects which\ndo not limit the set of safe actions available to the ego vehicle under\nconsideration of all uncertainties. As an initial step, the use case is\ndecomposed into functional scenarios with respect to collision relevance. For\neach functional scenario, possible actions of both the ego vehicle and any\nother dynamic object are formalized as equations. This set of possible actions\nis constrained by traffic rules, yielding relevance criteria. As a result, we\npresent a conservative estimation which dynamic objects are relevant for\nperception and need to be considered for a complete evaluation. The estimation\nprovides requirements which are applicable for offline testing and validation\nof perception components. A visualization is presented for examples from the\nhighD dataset, showing the plausibility of the results. Finally, a possibility\nfor a future validation of the presented relevance concept is outlined.\n","authors":["Ken Mori","Kai Storms","Steven Peters"],"pdf_url":"https://arxiv.org/pdf/2307.10873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10867v1","updated":"2023-07-20T13:40:22Z","published":"2023-07-20T13:40:22Z","title":"FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with\n Human Feedback","summary":" Captions are crucial for understanding scientific visualizations and\ndocuments. 
Existing captioning methods for scientific figures rely on\nfigure-caption pairs extracted from documents for training, many of which fall\nshort with respect to metrics like helpfulness, explainability, and\nvisual-descriptiveness [15], leading to generated captions being misaligned with\nreader preferences. To enable the generation of high-quality figure captions,\nwe introduce FigCaps-HF, a new framework for figure-caption generation that can\nincorporate domain expert feedback in generating captions optimized for reader\npreferences. Our framework comprises 1) an automatic method for evaluating the\nquality of figure-caption pairs, and 2) a novel reinforcement learning with human\nfeedback (RLHF) method to optimize a generative figure-to-caption model for\nreader preferences. We demonstrate the effectiveness of our simple learning\nframework by improving performance over standard fine-tuning across different\ntypes of models. In particular, when using BLIP as the base model, our RLHF\nframework achieves a mean gain of 35.7%, 16.9%, and 9% in ROUGE, BLEU, and\nMeteor, respectively. Finally, we release a large-scale benchmark dataset with\nhuman feedback on figure-caption pairs to enable further evaluation and\ndevelopment of RLHF techniques for this problem.\n","authors":["Ashish Singh","Prateek Agarwal","Zixuan Huang","Arpita Singh","Tong Yu","Sungchul Kim","Victor Bursztyn","Nikos Vlassis","Ryan A. Rossi"],"pdf_url":"https://arxiv.org/pdf/2307.10867v1.pdf","comment":"19 pages, 4 figures. Benchmark Documentation:\n https://figcapshf.github.io/"},{"id":"http://arxiv.org/abs/2307.10864v1","updated":"2023-07-20T13:33:28Z","published":"2023-07-20T13:33:28Z","title":"Divide & Bind Your Attention for Improved Generative Semantic Nursing","summary":" Emerging large-scale text-to-image generative models, e.g., Stable Diffusion\n(SD), have exhibited overwhelming results with high fidelity. Despite the\nmagnificent progress, current state-of-the-art models still struggle to\ngenerate images fully adhering to the input prompt. Prior work, Attend &\nExcite, has introduced the concept of Generative Semantic Nursing (GSN), aiming\nto optimize cross-attention during inference time to better incorporate the\nsemantics. It demonstrates promising results in generating simple prompts,\ne.g., ``a cat and a dog''. However, its efficacy declines when dealing with\nmore complex prompts, and it does not explicitly address the problem of\nimproper attribute binding. To address the challenges posed by complex prompts\nor scenarios involving multiple entities and to achieve improved attribute\nbinding, we propose Divide & Bind. We introduce two novel loss objectives for\nGSN: an attendance loss and a binding loss. Our approach stands out in its\nability to faithfully synthesize desired objects with improved attribute\nalignment from complex prompts and exhibits superior performance across\nmultiple evaluation benchmarks. 
More videos and updates can be found on the\nproject page \\url{https://sites.google.com/view/divide-and-bind}.\n","authors":["Yumeng Li","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2307.10864v1.pdf","comment":"Project page: \\url{https://sites.google.com/view/divide-and-bind}"},{"id":"http://arxiv.org/abs/2307.10854v1","updated":"2023-07-20T13:17:30Z","published":"2023-07-20T13:17:30Z","title":"BlendFace: Re-designing Identity Encoders for Face-Swapping","summary":" The great advancements of generative adversarial networks and face\nrecognition models in computer vision have made it possible to swap identities\non images from single sources. Although many studies seem to have proposed\nalmost satisfactory solutions, we notice that previous methods still suffer from\nan identity-attribute entanglement that causes undesired attribute swapping,\nbecause widely used identity encoders, e.g., ArcFace, have some crucial attribute\nbiases owing to their pretraining on face recognition tasks. To address this\nissue, we design BlendFace, a novel identity encoder for face-swapping. The key\nidea behind BlendFace is that training face recognition models on blended images,\nwhose attributes are replaced with those of another person, mitigates\ninter-personal biases such as hairstyles. BlendFace feeds disentangled identity\nfeatures into generators and guides generators properly as an identity loss function.\nExtensive experiments demonstrate that BlendFace improves the\nidentity-attribute disentanglement in face-swapping models, maintaining a\ncomparable quantitative performance to previous methods.\n","authors":["Kaede Shiohara","Xingchao Yang","Takafumi Taketomi"],"pdf_url":"https://arxiv.org/pdf/2307.10854v1.pdf","comment":"ICCV2023. Code: https://github.com/mapooon/BlendFace, Webpage:\n https://mapooon.github.io/BlendFacePage/"},{"id":"http://arxiv.org/abs/2307.10853v1","updated":"2023-07-20T13:16:10Z","published":"2023-07-20T13:16:10Z","title":"Exploring Effective Priors and Efficient Models for Weakly-Supervised\n Change Detection","summary":" Weakly-supervised change detection (WSCD) aims to detect pixel-level changes\nwith only image-level annotations. Owing to its label efficiency, WSCD is\ndrawing increasing attention recently. However, current WSCD methods often\nencounter the challenge of change missing and fabricating, i.e., the\ninconsistency between image-level annotations and pixel-level predictions.\nSpecifically, change missing refers to the situation in which the WSCD model fails\nto predict any changed pixels, even though the image-level label indicates\nchanged, and vice versa for change fabricating. To address this challenge, in\nthis work, we leverage global-scale and local-scale priors in WSCD and propose\ntwo components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.\nThe DP decoder decodes samples with the changed image-level label, skips\nsamples with the unchanged label, and replaces them with an all-unchanged\npixel-level label. The LG constraint is derived from the correspondence between\nchanged representations and image-level labels, penalizing the model when it\nmispredicts the change status. Additionally, we develop TransWCD, a simple yet\npowerful transformer-based model, showcasing the potential of weakly-supervised\nlearning in change detection. By integrating the DP decoder and LG constraint\ninto TransWCD, we form TransWCD-DL. 
Our proposed TransWCD and TransWCD-DL\nachieve significant +6.33% and +9.55% F1 score improvements over the\nstate-of-the-art methods on the WHU-CD dataset, respectively. Some performance\nmetrics even exceed several fully-supervised change detection (FSCD)\ncompetitors. Code will be available at\nhttps://github.com/zhenghuizhao/TransWCD.\n","authors":["Zhenghui Zhao","Lixiang Ru","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10845v1","updated":"2023-07-20T13:07:41Z","published":"2023-07-20T13:07:41Z","title":"Self-paced Weight Consolidation for Continual Learning","summary":" Continual learning algorithms, which keep the parameters of new tasks close to\nthose of previous tasks, are popular for preventing catastrophic forgetting in\nsequential task learning settings. However, 1) the performance of the new\ncontinual learner will be degraded without distinguishing the contributions of\npreviously learned tasks; 2) the computational cost will be greatly increased\nwith the number of tasks, since most existing algorithms need to regularize all\nprevious tasks when learning new tasks. To address the above challenges, we\npropose a self-paced Weight Consolidation (spWC) framework to attain robust\ncontinual learning via evaluating the discriminative contributions of previous\ntasks. To be specific, we develop a self-paced regularization to reflect the\npriorities of past tasks by measuring difficulty based on a key performance\nindicator (i.e., accuracy). When encountering a new task, all previous tasks\nare sorted from \"difficult\" to \"easy\" based on the priorities. Then the\nparameters of the new continual learner will be learned via selectively\nmaintaining the knowledge amongst more difficult past tasks, which could well\novercome catastrophic forgetting with less computational cost. We adopt an\nalternative convex search to iteratively update the model parameters and\npriority weights in the bi-convex formulation. The proposed spWC framework is\nplug-and-play and is applicable to most continual learning algorithms (e.g.,\nEWC, MAS and RCIL) in different directions (e.g., classification and\nsegmentation). Experimental results on several public benchmark datasets\ndemonstrate that our proposed framework can effectively improve performance\nwhen compared with other popular continual learning algorithms.\n","authors":["Wei Cong","Yang Cong","Gan Sun","Yuyang Liu","Jiahua Dong"],"pdf_url":"https://arxiv.org/pdf/2307.10845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10843v1","updated":"2023-07-20T13:04:26Z","published":"2023-07-20T13:04:26Z","title":"Global Precipitation Nowcasting of Integrated Multi-satellitE Retrievals\n for GPM: A U-Net Convolutional LSTM Architecture","summary":" This paper presents a deep learning architecture for nowcasting of\nprecipitation almost globally every 30 min with a 4-hour lead time. The\narchitecture fuses a U-Net and a convolutional long short-term memory (LSTM)\nneural network and is trained using data from the Integrated MultisatellitE\nRetrievals for GPM (IMERG) and a few key precipitation drivers from the Global\nForecast System (GFS). The impacts of different training loss functions,\nincluding the mean-squared error (regression) and the focal-loss\n(classification), on the quality of precipitation nowcasts are studied. 
The\nresults indicate that the regression network performs well in capturing light\nprecipitation (below 1.6 mm/hr), but the classification network can outperform\nthe regression network for nowcasting of precipitation extremes (>8 mm/hr), in\nterms of the critical success index (CSI). Using the Wasserstein distance, it\nis shown that the precipitation predicted by the classification network has a\nclass probability distribution closer to the IMERG than the regression network.\nIt is uncovered that the inclusion of the physical variables can improve\nprecipitation nowcasting, especially at longer lead times in both networks.\nTaking IMERG as a relative reference, a multi-scale analysis in terms of\nfractions skill score (FSS) shows that the nowcasting machine remains skillful\n(FSS > 0.5) at the resolution of 10 km compared to 50 km for GFS. For\nprecipitation rates greater than 4 mm/hr, only the classification network\nremains FSS-skillful on scales greater than 50 km within a 2-hour lead time.\n","authors":["Reyhaneh Rahimi","Ardeshir Ebtehaj","Ali Behrangi","Jackson Tan"],"pdf_url":"https://arxiv.org/pdf/2307.10843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10842v1","updated":"2023-07-20T13:02:45Z","published":"2023-07-20T13:02:45Z","title":"Label Calibration for Semantic Segmentation Under Domain Shift","summary":" Performance of a pre-trained semantic segmentation model is likely to\nsubstantially decrease on data from a new domain. We show that a pre-trained model\ncan be adapted to unlabelled target domain data by calculating soft-label\nprototypes under the domain shift and making predictions according to the\nprototype closest to the vector with predicted class probabilities. The\nproposed adaptation procedure is fast, comes almost for free in terms of\ncomputational resources and leads to considerable performance improvements. We\ndemonstrate the benefits of such label calibration on the highly-practical\nsynthetic-to-real semantic segmentation problem.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10842v1.pdf","comment":"ICLR 2023 Workshop on Pitfalls of Limited Data and Computation for\n Trustworthy ML"},{"id":"http://arxiv.org/abs/2307.08930v2","updated":"2023-07-20T12:41:19Z","published":"2023-07-18T02:35:01Z","title":"Unsupervised Deep Graph Matching Based on Cycle Consistency","summary":" We contribute to the sparsely populated area of unsupervised deep graph\nmatching with application to keypoint matching in images. Contrary to the\nstandard \\emph{supervised} approach, our method does not require ground truth\ncorrespondences between keypoint pairs. Instead, it is self-supervised by\nenforcing consistency of matchings between images of the same object category.\nAs the matching and the consistency loss are discrete, their derivatives cannot\nbe straightforwardly used for learning. We address this issue in a principled\nway by building our method upon the recent results on black-box differentiation\nof combinatorial solvers. 
This makes our method exceptionally flexible, as it\nis compatible with arbitrary network architectures and combinatorial solvers.\nOur experimental evaluation suggests that our technique sets a new\nstate-of-the-art for unsupervised graph matching.\n","authors":["Siddharth Tourani","Carsten Rother","Muhammad Haris Khan","Bogdan Savchynskyy"],"pdf_url":"https://arxiv.org/pdf/2307.08930v2.pdf","comment":"12 pages, 5 figures, 3 papers"},{"id":"http://arxiv.org/abs/2307.10824v1","updated":"2023-07-20T12:38:17Z","published":"2023-07-20T12:38:17Z","title":"Parse and Recall: Towards Accurate Lung Nodule Malignancy Prediction\n like Radiologists","summary":" Lung cancer is a leading cause of death worldwide and early screening is\ncritical for improving survival outcomes. In clinical practice, the contextual\nstructure of nodules and the accumulated experience of radiologists are the two\ncore elements related to the accuracy of identification of benign and malignant\nnodules. Contextual information provides comprehensive information about\nnodules such as location, shape, and peripheral vessels, and experienced\nradiologists can search for clues from previous cases as a reference to enrich\nthe basis of decision-making. In this paper, we propose a radiologist-inspired\nmethod to simulate the diagnostic process of radiologists, which is composed of\ncontext parsing and prototype recalling modules. The context parsing module\nfirst segments the context structure of nodules and then aggregates contextual\ninformation for a more comprehensive understanding of the nodule. The prototype\nrecalling module utilizes prototype-based learning to condense previously\nlearned cases as prototypes for comparative analysis, which is updated online\nin a momentum way during training. Building on the two modules, our method\nleverages both the intrinsic characteristics of the nodules and the external\nknowledge accumulated from other nodules to achieve a sound diagnosis. To meet\nthe needs of both low-dose and noncontrast screening, we collect a large-scale\ndataset of 12,852 and 4,029 nodules from low-dose and noncontrast CTs\nrespectively, each with pathology- or follow-up-confirmed labels. Experiments\non several datasets demonstrate that our method achieves advanced screening\nperformance on both low-dose and noncontrast scenarios.\n","authors":["Jianpeng Zhang","Xianghua Ye","Jianfeng Zhang","Yuxing Tang","Minfeng Xu","Jianfei Guo","Xin Chen","Zaiyi Liu","Jingren Zhou","Le Lu","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10824v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2212.13792v2","updated":"2023-07-20T12:37:06Z","published":"2022-12-28T12:08:27Z","title":"Periocular Biometrics: A Modality for Unconstrained Scenarios","summary":" Periocular refers to the externally visible region of the face that surrounds\nthe eye socket. This feature-rich area can provide accurate identification in\nunconstrained or uncooperative scenarios, where the iris or face modalities may\nnot offer sufficient biometric cues due to factors such as partial occlusion or\nhigh subject-to-camera distance. The COVID-19 pandemic has further highlighted\nits importance, as the ocular region remained the only visible facial area even\nin controlled settings due to the widespread use of masks. 
This paper discusses\nthe state of the art in periocular biometrics, presenting an overall framework\nencompassing its most significant research aspects, which include: (a) ocular\ndefinition, acquisition, and detection; (b) identity recognition, including\ncombination with other modalities and use of various spectra; and (c) ocular\nsoft-biometric analysis. Finally, we conclude by addressing current challenges\nand proposing future directions.\n","authors":["Fernando Alonso-Fernandez","Josef Bigun","Julian Fierrez","Naser Damer","Hugo Proença","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2212.13792v2.pdf","comment":"Published at IEEE Computer journal"},{"id":"http://arxiv.org/abs/2307.10822v1","updated":"2023-07-20T12:32:25Z","published":"2023-07-20T12:32:25Z","title":"Gradient-Semantic Compensation for Incremental Semantic Segmentation","summary":" Incremental semantic segmentation aims to continually learn the segmentation\nof new coming classes without accessing the training data of previously learned\nclasses. However, most current methods fail to address catastrophic forgetting\nand background shift since they 1) treat all previous classes equally without\nconsidering different forgetting paces caused by imbalanced gradient\nback-propagation; 2) lack strong semantic guidance between classes. To tackle\nthe above challenges, in this paper, we propose a Gradient-Semantic\nCompensation (GSC) model, which surmounts incremental semantic segmentation\nfrom both gradient and semantic perspectives. Specifically, to address\ncatastrophic forgetting from the gradient aspect, we develop a step-aware\ngradient compensation that can balance forgetting paces of previously seen\nclasses via re-weighting gradient backpropagation. Meanwhile, we propose a\nsoft-sharp semantic relation distillation to distill consistent inter-class\nsemantic relations via soft labels for alleviating catastrophic forgetting from\nthe semantic aspect. In addition, we develop a prototypical pseudo re-labeling\nthat provides strong semantic guidance to mitigate background shift. It\nproduces high-quality pseudo labels for old classes in the background by\nmeasuring distances between pixels and class-wise prototypes. Extensive\nexperiments on three public datasets, i.e., Pascal VOC 2012, ADE20K, and\nCityscapes, demonstrate the effectiveness of our proposed GSC model.\n","authors":["Wei Cong","Yang Cong","Jiahua Dong","Gan Sun","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2307.10822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10816v1","updated":"2023-07-20T12:25:06Z","published":"2023-07-20T12:25:06Z","title":"BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained\n Diffusion","summary":" Recent text-to-image diffusion models have demonstrated an astonishing\ncapacity to generate high-quality images. However, researchers mainly studied\nthe way of synthesizing images with only text prompts. While some works have\nexplored using other modalities as conditions, considerable paired data, e.g.,\nbox/mask-image pairs, and fine-tuning time are required for nurturing models.\nAs such paired data is time-consuming and labor-intensive to acquire and\nrestricted to a closed set, this potentially becomes the bottleneck for\napplications in an open world. This paper focuses on the simplest form of\nuser-provided conditions, e.g., box or scribble. 
To mitigate the aforementioned\nproblem, we propose a training-free method to control objects and contexts in\nthe synthesized images adhering to the given spatial conditions. Specifically,\nthree spatial constraints, i.e., Inner-Box, Outer-Box, and Corner Constraints,\nare designed and seamlessly integrated into the denoising step of diffusion\nmodels, requiring no additional training and massive annotated layout data.\nExtensive results show that the proposed constraints can control what and where\nto present in the images while retaining the ability of the Stable Diffusion\nmodel to synthesize with high fidelity and diverse concept coverage. The code\nis publicly available at https://github.com/Sierkinhane/BoxDiff.\n","authors":["Jinheng Xie","Yuexiang Li","Yawen Huang","Haozhe Liu","Wentian Zhang","Yefeng Zheng","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2307.10816v1.pdf","comment":"Accepted by ICCV 2023. The paper is still being revised for better\n organization and comparison"},{"id":"http://arxiv.org/abs/2306.09683v2","updated":"2023-07-20T12:23:12Z","published":"2023-06-16T08:27:46Z","title":"Scaling Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection has benefited greatly from pretrained\nvision-language models, but is still limited by the amount of available\ndetection training data. While detection training data can be expanded by using\nWeb image-text pairs as weak supervision, this has not been done at scales\ncomparable to image-level pretraining. Here, we scale up detection data with\nself-training, which uses an existing detector to generate pseudo-box\nannotations on image-text pairs. Major challenges in scaling self-training are\nthe choice of label space, pseudo-annotation filtering, and training\nefficiency. We present the OWLv2 model and OWL-ST self-training recipe, which\naddress these challenges. OWLv2 surpasses the performance of previous\nstate-of-the-art open-vocabulary detectors already at comparable training\nscales (~10M examples). However, with OWL-ST, we can scale to over 1B examples,\nyielding further large improvement: With an L/14 architecture, OWL-ST improves\nAP on LVIS rare classes, for which the model has seen no human box annotations,\nfrom 31.2% to 44.6% (43% relative improvement). OWL-ST unlocks Web-scale\ntraining for open-world localization, similar to what has been seen for image\nclassification and language modelling.\n","authors":["Matthias Minderer","Alexey Gritsenko","Neil Houlsby"],"pdf_url":"https://arxiv.org/pdf/2306.09683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10813v1","updated":"2023-07-20T12:21:26Z","published":"2023-07-20T12:21:26Z","title":"Perceptual Quality Assessment of Omnidirectional Audio-visual Signals","summary":" Omnidirectional videos (ODVs) play an increasingly important role in the\napplication fields of medical, education, advertising, tourism, etc. Assessing\nthe quality of ODVs is significant for service-providers to improve the user's\nQuality of Experience (QoE). However, most existing quality assessment studies\nfor ODVs only focus on the visual distortions of videos, while ignoring that\nthe overall QoE also depends on the accompanying audio signals. 
In this paper,\nwe first establish a large-scale audio-visual quality assessment dataset for\nomnidirectional videos, which includes 375 distorted omnidirectional\naudio-visual (A/V) sequences generated from 15 high-quality pristine\nomnidirectional A/V contents, and the corresponding perceptual audio-visual\nquality scores. Then, we design three baseline methods for full-reference\nomnidirectional audio-visual quality assessment (OAVQA), which combine existing\nstate-of-the-art single-mode audio and video QA models via multimodal fusion\nstrategies. We validate the effectiveness of the A/V multimodal fusion method\nfor OAVQA on our dataset, which provides a new benchmark for omnidirectional\nQoE evaluation. Our dataset is available at https://github.com/iamazxl/OAVQA.\n","authors":["Xilei Zhu","Huiyu Duan","Yuqin Cao","Yuxin Zhu","Yucheng Zhu","Jing Liu","Li Chen","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2307.10813v1.pdf","comment":"12 pages, 5 figures, to be published in CICAI2023"},{"id":"http://arxiv.org/abs/2009.03259v2","updated":"2023-07-20T12:11:56Z","published":"2020-09-07T17:27:27Z","title":"Implicit Multidimensional Projection of Local Subspaces","summary":" We propose a visualization method to understand the effect of\nmultidimensional projection on local subspaces, using implicit function\ndifferentiation. Here, we understand the local subspace as the multidimensional\nlocal neighborhood of data points. Existing methods focus on the projection of\nmultidimensional data points, and the neighborhood information is ignored. Our\nmethod is able to analyze the shape and directional information of the local\nsubspace to gain more insights into the global structure of the data through\nthe perception of local structures. Local subspaces are fitted by\nmultidimensional ellipses that are spanned by basis vectors. An accurate and\nefficient vector transformation method is proposed based on analytical\ndifferentiation of multidimensional projections formulated as implicit\nfunctions. The results are visualized as glyphs and analyzed using a full set\nof specifically-designed interactions supported in our efficient web-based\nvisualization tool. The usefulness of our method is demonstrated using various\nmulti- and high-dimensional benchmark datasets. Our implicit differentiation\nvector transformation is evaluated through numerical comparisons; the overall\nmethod is evaluated through exploration examples and use cases.\n","authors":["Rongzheng Bian","Yumeng Xue","Liang Zhou","Jian Zhang","Baoquan Chen","Daniel Weiskopf","Yunhai Wang"],"pdf_url":"https://arxiv.org/pdf/2009.03259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. 
In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2307.09906v2","updated":"2023-07-20T12:00:23Z","published":"2023-07-19T11:10:26Z","title":"Implicit Identity Representation Conditioned Memory Compensation Network\n for Talking Head video Generation","summary":" Talking head video generation aims to animate a human face in a still image\nwith dynamic poses and expressions using motion information derived from a\ntarget-driving video, while maintaining the person's identity in the source\nimage. However, dramatic and complex motions in the driving video cause\nambiguous generation, because the still source image cannot provide sufficient\nappearance information for occluded regions or delicate expression variations,\nwhich produces severe artifacts and significantly degrades the generation\nquality. To tackle this problem, we propose to learn a global facial\nrepresentation space, and design a novel implicit identity representation\nconditioned memory compensation network, coined as MCNet, for high-fidelity\ntalking head generation.~Specifically, we devise a network module to learn a\nunified spatial facial meta-memory bank from all training samples, which can\nprovide rich facial structure and appearance priors to compensate warped source\nfacial features for the generation. Furthermore, we propose an effective query\nmechanism based on implicit identity representations learned from the discrete\nkeypoints of the source image. It can greatly facilitate the retrieval of more\ncorrelated information from the memory bank for the compensation. Extensive\nexperiments demonstrate that MCNet can learn representative and complementary\nfacial memory, and can clearly outperform previous state-of-the-art talking\nhead generation methods on VoxCeleb1 and CelebV datasets. 
Please check our\n\\href{https://github.com/harlanhong/ICCV2023-MCNET}{Project}.\n","authors":["Fa-Ting Hong","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09906v2.pdf","comment":"Accepted by ICCV2023, update the reference and figures"},{"id":"http://arxiv.org/abs/2307.10797v1","updated":"2023-07-20T11:59:42Z","published":"2023-07-20T11:59:42Z","title":"HyperReenact: One-Shot Reenactment via Jointly Learning to Refine and\n Retarget Faces","summary":" In this paper, we present our method for neural face reenactment, called\nHyperReenact, that aims to generate realistic talking head images of a source\nidentity, driven by a target facial pose. Existing state-of-the-art face\nreenactment methods train controllable generative models that learn to\nsynthesize realistic facial images, yet producing reenacted faces that are\nprone to significant visual artifacts, especially under the challenging\ncondition of extreme head pose changes, or requiring expensive few-shot\nfine-tuning to better preserve the source identity characteristics. We propose\nto address these limitations by leveraging the photorealistic generation\nability and the disentangled properties of a pretrained StyleGAN2 generator, by\nfirst inverting the real images into its latent space and then using a\nhypernetwork to perform: (i) refinement of the source identity characteristics\nand (ii) facial pose re-targeting, eliminating this way the dependence on\nexternal editing methods that typically produce artifacts. Our method operates\nunder the one-shot setting (i.e., using a single source frame) and allows for\ncross-subject reenactment, without requiring any subject-specific fine-tuning.\nWe compare our method both quantitatively and qualitatively against several\nstate-of-the-art techniques on the standard benchmarks of VoxCeleb1 and\nVoxCeleb2, demonstrating the superiority of our approach in producing\nartifact-free images, exhibiting remarkable robustness even under extreme head\npose changes. We make the code and the pretrained models publicly available at:\nhttps://github.com/StelaBou/HyperReenact .\n","authors":["Stella Bounareli","Christos Tzelepis","Vasileios Argyriou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2307.10797v1.pdf","comment":"Accepted for publication in ICCV 2023. Project page:\n https://stelabou.github.io/hyperreenact.github.io/ Code:\n https://github.com/StelaBou/HyperReenact"},{"id":"http://arxiv.org/abs/2307.10792v1","updated":"2023-07-20T11:45:38Z","published":"2023-07-20T11:45:38Z","title":"Optimizing PatchCore for Few/many-shot Anomaly Detection","summary":" Few-shot anomaly detection (AD) is an emerging sub-field of general AD, and\ntries to distinguish between normal and anomalous data using only few selected\nsamples. While newly proposed few-shot AD methods do compare against\npre-existing algorithms developed for the full-shot domain as baselines, they\ndo not dedicatedly optimize them for the few-shot setting. It thus remains\nunclear if the performance of such pre-existing algorithms can be further\nimproved. We address said question in this work. Specifically, we present a\nstudy on the AD/anomaly segmentation (AS) performance of PatchCore, the current\nstate-of-the-art full-shot AD/AS algorithm, in both the few-shot and the\nmany-shot settings. 
We hypothesize that further performance improvements can be\nrealized by (I) optimizing its various hyperparameters, and by (II)\ntransferring techniques known to improve few-shot supervised learning to the AD\ndomain. Exhaustive experiments on the public VisA and MVTec AD datasets reveal\nthat (I) significant performance improvements can be realized by optimizing\nhyperparameters such as the underlying feature extractor, and that (II)\nimage-level augmentations can, but are not guaranteed, to improve performance.\nBased on these findings, we achieve a new state of the art in few-shot AD on\nVisA, further demonstrating the merit of adapting pre-existing AD/AS methods to\nthe few-shot setting. Last, we identify the investigation of feature extractors\nwith a strong inductive bias as a potential future research direction for\n(few-shot) AD/AS.\n","authors":["João Santos","Triet Tran","Oliver Rippel"],"pdf_url":"https://arxiv.org/pdf/2307.10792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10790v1","updated":"2023-07-20T11:42:24Z","published":"2023-07-20T11:42:24Z","title":"Behavioral Analysis of Vision-and-Language Navigation Agents","summary":" To be successful, Vision-and-Language Navigation (VLN) agents must be able to\nground instructions to actions based on their surroundings. In this work, we\ndevelop a methodology to study agent behavior on a skill-specific basis --\nexamining how well existing agents ground instructions about stopping, turning,\nand moving towards specified objects or rooms. Our approach is based on\ngenerating skill-specific interventions and measuring changes in agent\npredictions. We present a detailed case study analyzing the behavior of a\nrecent agent and then compare multiple agents in terms of skill-specific\ncompetency scores. This analysis suggests that biases from training have\nlasting effects on agent behavior and that existing models are able to ground\nsimple referring expressions. Our comparisons between models show that\nskill-specific scores correlate with improvements in overall VLN task\nperformance.\n","authors":["Zijiao Yang","Arjun Majumdar","Stefan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10790v1.pdf","comment":"accepted to CVPR2023"},{"id":"http://arxiv.org/abs/2307.10787v1","updated":"2023-07-20T11:36:45Z","published":"2023-07-20T11:36:45Z","title":"Feed-Forward Source-Free Domain Adaptation via Class Prototypes","summary":" Source-free domain adaptation has become popular because of its practical\nusefulness and no need to access source data. However, the adaptation process\nstill takes a considerable amount of time and is predominantly based on\noptimization that relies on back-propagation. In this work we present a simple\nfeed-forward approach that challenges the need for back-propagation based\nadaptation. Our approach is based on computing prototypes of classes under the\ndomain shift using a pre-trained model. 
It achieves strong improvements in\naccuracy compared to the pre-trained model and requires only a small fraction\nof time of existing domain adaptation methods.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10787v1.pdf","comment":"ECCV 2022 Workshop on Out of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2307.10784v1","updated":"2023-07-20T11:33:46Z","published":"2023-07-20T11:33:46Z","title":"SMURF: Spatial Multi-Representation Fusion for 3D Object Detection with\n 4D Imaging Radar","summary":" The 4D Millimeter wave (mmWave) radar is a promising technology for vehicle\nsensing due to its cost-effectiveness and operability in adverse weather\nconditions. However, the adoption of this technology has been hindered by\nsparsity and noise issues in radar point cloud data. This paper introduces\nspatial multi-representation fusion (SMURF), a novel approach to 3D object\ndetection using a single 4D imaging radar. SMURF leverages multiple\nrepresentations of radar detection points, including pillarization and density\nfeatures of a multi-dimensional Gaussian mixture distribution through kernel\ndensity estimation (KDE). KDE effectively mitigates measurement inaccuracy\ncaused by limited angular resolution and multi-path propagation of radar\nsignals. Additionally, KDE helps alleviate point cloud sparsity by capturing\ndensity features. Experimental evaluations on View-of-Delft (VoD) and\nTJ4DRadSet datasets demonstrate the effectiveness and generalization ability of\nSMURF, outperforming recently proposed 4D imaging radar-based\nsingle-representation models. Moreover, while using 4D imaging radar only,\nSMURF still achieves comparable performance to the state-of-the-art 4D imaging\nradar and camera fusion-based method, with an increase of 1.22% in the mean\naverage precision on bird's-eye view of TJ4DRadSet dataset and 1.32% in the 3D\nmean average precision on the entire annotated area of VoD dataset. Our\nproposed method demonstrates impressive inference time and addresses the\nchallenges of real-time detection, with the inference time no more than 0.05\nseconds for most scans on both datasets. This research highlights the benefits\nof 4D mmWave radar and is a strong benchmark for subsequent works regarding 3D\nobject detection with 4D imaging radar.\n","authors":["Jianan Liu","Qiuchi Zhao","Weiyi Xiong","Tao Huang","Qing-Long Han","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.10784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10782v1","updated":"2023-07-20T11:32:51Z","published":"2023-07-20T11:32:51Z","title":"See More and Know More: Zero-shot Point Cloud Segmentation via\n Multi-modal Visual Data","summary":" Zero-shot point cloud segmentation aims to make deep models capable of\nrecognizing novel objects in point cloud that are unseen in the training phase.\nRecent trends favor the pipeline which transfers knowledge from seen classes\nwith labels to unseen classes without labels. They typically align visual\nfeatures with semantic features obtained from word embedding by the supervision\nof seen classes' annotations. However, point cloud contains limited information\nto fully match with semantic features. In fact, the rich appearance information\nof images is a natural complement to the textureless point cloud, which is not\nwell explored in previous literature. 
Motivated by this, we propose a novel\nmulti-modal zero-shot learning method to better utilize the complementary\ninformation of point clouds and images for more accurate visual-semantic\nalignment. Extensive experiments are performed in two popular benchmarks, i.e.,\nSemanticKITTI and nuScenes, and our method outperforms current SOTA methods\nwith 52% and 49% improvement on average for unseen class mIoU, respectively.\n","authors":["Yuhang Lu","Qi Jiang","Runnan Chen","Yuenan Hou","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2307.10782v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10780v1","updated":"2023-07-20T11:30:12Z","published":"2023-07-20T11:30:12Z","title":"Learned Thresholds Token Merging and Pruning for Vision Transformers","summary":" Vision transformers have demonstrated remarkable success in a wide range of\ncomputer vision tasks over the last years. However, their high computational\ncosts remain a significant barrier to their practical deployment. In\nparticular, the complexity of transformer models is quadratic with respect to\nthe number of input tokens. Therefore techniques that reduce the number of\ninput tokens that need to be processed have been proposed. This paper\nintroduces Learned Thresholds token Merging and Pruning (LTMP), a novel\napproach that leverages the strengths of both token merging and token pruning.\nLTMP uses learned threshold masking modules that dynamically determine which\ntokens to merge and which to prune. We demonstrate our approach with extensive\nexperiments on vision transformers on the ImageNet classification task. Our\nresults demonstrate that LTMP achieves state-of-the-art accuracy across\nreduction rates while requiring only a single fine-tuning epoch, which is an\norder of magnitude faster than previous methods. Code is available at\nhttps://github.com/Mxbonn/ltmp .\n","authors":["Maxim Bonnaerens","Joni Dambre"],"pdf_url":"https://arxiv.org/pdf/2307.10780v1.pdf","comment":"Paper to be presented at Efficient Systems for Foundation Models\n Workshop at the International Conference on Machine Learning (ICML) 2023"},{"id":"http://arxiv.org/abs/2307.10776v1","updated":"2023-07-20T11:24:55Z","published":"2023-07-20T11:24:55Z","title":"Urban Radiance Field Representation with Deformable Neural Mesh\n Primitives","summary":" Neural Radiance Fields (NeRFs) have achieved great success in the past few\nyears. However, most current methods still require intensive resources due to\nray marching-based rendering. To construct urban-level radiance fields\nefficiently, we design Deformable Neural Mesh Primitive~(DNMP), and propose to\nparameterize the entire scene with such primitives. The DNMP is a flexible and\ncompact neural variant of classic mesh representation, which enjoys both the\nefficiency of rasterization-based rendering and the powerful neural\nrepresentation capability for photo-realistic image synthesis. Specifically, a\nDNMP consists of a set of connected deformable mesh vertices with paired vertex\nfeatures to parameterize the geometry and radiance information of a local area.\nTo constrain the degree of freedom for optimization and lower the storage\nbudgets, we enforce the shape of each primitive to be decoded from a relatively\nlow-dimensional latent space. The rendering colors are decoded from the vertex\nfeatures (interpolated with rasterization) by a view-dependent MLP. 
The DNMP\nprovides a new paradigm for urban-level scene representation with appealing\nproperties: $(1)$ High-quality rendering. Our method achieves leading\nperformance for novel view synthesis in urban scenarios. $(2)$ Low\ncomputational costs. Our representation enables fast rendering (2.07ms/1k\npixels) and low peak memory usage (110MB/1k pixels). We also present a\nlightweight version that can run 33$\\times$ faster than vanilla NeRFs, and\ncomparable to the highly-optimized Instant-NGP (0.61 vs 0.71ms/1k pixels).\nProject page: \\href{https://dnmp.github.io/}{https://dnmp.github.io/}.\n","authors":["Fan Lu","Yan Xu","Guang Chen","Hongsheng Li","Kwan-Yee Lin","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.10776v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.10768v1","updated":"2023-07-20T10:57:02Z","published":"2023-07-20T10:57:02Z","title":"Decoding the Enigma: Benchmarking Humans and AIs on the Many Facets of\n Working Memory","summary":" Working memory (WM), a fundamental cognitive process facilitating the\ntemporary storage, integration, manipulation, and retrieval of information,\nplays a vital role in reasoning and decision-making tasks. Robust benchmark\ndatasets that capture the multifaceted nature of WM are crucial for the\neffective development and evaluation of AI WM models. Here, we introduce a\ncomprehensive Working Memory (WorM) benchmark dataset for this purpose. WorM\ncomprises 10 tasks and a total of 1 million trials, assessing 4\nfunctionalities, 3 domains, and 11 behavioral and neural characteristics of WM.\nWe jointly trained and tested state-of-the-art recurrent neural networks and\ntransformers on all these tasks. We also include human behavioral benchmarks as\nan upper bound for comparison. Our results suggest that AI models replicate\nsome characteristics of WM in the brain, most notably primacy and recency\neffects, and neural clusters and correlates specialized for different domains\nand functionalities of WM. In the experiments, we also reveal some limitations\nin existing models to approximate human behavior. This dataset serves as a\nvaluable resource for communities in cognitive psychology, neuroscience, and\nAI, offering a standardized framework to compare and enhance WM models,\ninvestigate WM's neural underpinnings, and develop WM models with human-like\ncapabilities. Our source code and data are available at\nhttps://github.com/ZhangLab-DeepNeuroCogLab/WorM.\n","authors":["Ankur Sikarwar","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10763v1","updated":"2023-07-20T10:53:12Z","published":"2023-07-20T10:53:12Z","title":"MSQNet: Actor-agnostic Action Recognition with Multi-modal Query","summary":" Existing action recognition methods are typically actor-specific due to the\nintrinsic topological and apparent differences among the actors. This requires\nactor-specific pose estimation (e.g., humans vs. animals), leading to\ncumbersome model design complexity and high maintenance costs. Moreover, they\noften focus on learning the visual modality alone and single-label\nclassification whilst neglecting other available information sources (e.g.,\nclass name text) and the concurrent occurrence of multiple actions. To overcome\nthese limitations, we propose a new approach called 'actor-agnostic multi-modal\nmulti-label action recognition,' which offers a unified solution for various\ntypes of actors, including humans and animals. 
We further formulate a novel\nMulti-modal Semantic Query Network (MSQNet) model in a transformer-based object\ndetection framework (e.g., DETR), characterized by leveraging visual and\ntextual modalities to represent the action classes better. The elimination of\nactor-specific model designs is a key advantage, as it removes the need for\nactor pose estimation altogether. Extensive experiments on five publicly\navailable benchmarks show that our MSQNet consistently outperforms the prior\narts of actor-specific alternatives on human and animal single- and multi-label\naction recognition tasks by up to 50%. Code will be released at\nhttps://github.com/mondalanindya/MSQNet.\n","authors":["Anindya Mondal","Sauradip Nag","Joaquin M Prada","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10753v1","updated":"2023-07-20T10:29:48Z","published":"2023-07-20T10:29:48Z","title":"LBL: Logarithmic Barrier Loss Function for One-class Classification","summary":" One-class classification (OCC) aims to train a classifier only with the\ntarget class data and attracts great attention for its strong applicability in\nreal-world applications. Although a lot of advances have been made in OCC, it\nstill lacks effective OCC loss functions for deep learning. In this paper,\na novel logarithmic barrier function based OCC loss (LBL), which assigns large\ngradients to the margin samples and thus derives a more compact hypersphere, is\nfirst proposed by smoothly approximating the OCC objective. However, the\noptimization of LBL may be unstable, especially when samples lie on the\nboundary, leading to an infinite loss. To address this issue, a\nunilateral relaxation Sigmoid function is introduced into LBL and a novel OCC\nloss named LBLSig is proposed. The LBLSig can be seen as a fusion of the mean\nsquare error (MSE) and the cross entropy (CE), and the optimization of LBLSig is\nsmoother owing to the unilateral relaxation Sigmoid function. The effectiveness\nof the proposed LBL and LBLSig is experimentally demonstrated in comparisons\nwith several state-of-the-art OCC algorithms on different network structures.\nThe source code can be found at https://github.com/ML-HDU/LBL_LBLSig.\n","authors":["Tianlei Wang","Dekang Liu","Wandong Zhang","Jiuwen Cao"],"pdf_url":"https://arxiv.org/pdf/2307.10753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13960v2","updated":"2023-07-20T10:26:56Z","published":"2023-06-24T13:29:54Z","title":"Regular SE(3) Group Convolutions for Volumetric Medical Image Analysis","summary":" Regular group convolutional neural networks (G-CNNs) have been shown to\nincrease model performance and improve equivariance to different geometrical\nsymmetries. This work addresses the problem of SE(3), i.e., roto-translation\nequivariance, on volumetric data. Volumetric image data is prevalent in many\nmedical settings. Motivated by the recent work on separable group convolutions,\nwe devise an SE(3) group convolution kernel separated into a continuous SO(3)\n(rotation) kernel and a spatial kernel. We approximate equivariance to the\ncontinuous setting by sampling uniform SO(3) grids. Our continuous SO(3) kernel\nis parameterized via RBF interpolation on similarly uniform grids. We\ndemonstrate the advantages of our approach in volumetric medical image\nanalysis. 
Our SE(3) equivariant models consistently outperform CNNs and regular\ndiscrete G-CNNs on challenging medical classification tasks and show\nsignificantly improved generalization capabilities. Our approach achieves up to\na 16.5% gain in accuracy over regular CNNs.\n","authors":["Thijs P. Kuipers","Erik J. Bekkers"],"pdf_url":"https://arxiv.org/pdf/2306.13960v2.pdf","comment":"10 pages, 1 figure, 2 tables, accepted at MICCAI 2023. Updated\n version to camera ready version 1"},{"id":"http://arxiv.org/abs/2307.10745v1","updated":"2023-07-20T10:16:03Z","published":"2023-07-20T10:16:03Z","title":"EdgeAL: An Edge Estimation Based Active Learning Approach for OCT\n Segmentation","summary":" Active learning algorithms have become increasingly popular for training\nmodels with limited data. However, selecting data for annotation remains a\nchallenging problem due to the limited information available on unseen data. To\naddress this issue, we propose EdgeAL, which utilizes the edge information of\nunseen images as {\\it a priori} information for measuring uncertainty. The\nuncertainty is quantified by analyzing the divergence and entropy in model\npredictions across edges. This measure is then used to select superpixels for\nannotation. We demonstrate the effectiveness of EdgeAL on multi-class Optical\nCoherence Tomography (OCT) segmentation tasks, where we achieved a 99% dice\nscore while reducing the annotation label cost to 12%, 2.3%, and 3%,\nrespectively, on three publicly available datasets (Duke, AROI, and UMN). The\nsource code is available at \\url{https://github.com/Mak-Ta-Reque/EdgeAL}\n","authors":["Md Abdul Kadir","Hasan Md Tusfiqur Alam","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2307.10745v1.pdf","comment":"This version of the contribution has been accepted for publication,\n after peer review (when applicable) but is not the Version of Record and does\n not reflect post-acceptance improvements, or any corrections. Use of this\n Accepted Version is subject to the publisher's Accepted Manuscript terms of\n use\n https://www.springernature.com/gp/open-research/policies/accepted-manuscript-terms"},{"id":"http://arxiv.org/abs/2307.02347v3","updated":"2023-07-20T09:54:41Z","published":"2023-07-05T15:03:10Z","title":"Detecting Images Generated by Deep Diffusion Models using their Local\n Intrinsic Dimensionality","summary":" Diffusion models recently have been successfully applied for the visual\nsynthesis of strikingly realistic appearing images. This raises strong concerns\nabout their potential for malicious purposes. In this paper, we propose using\nthe lightweight multi Local Intrinsic Dimensionality (multiLID), which has been\noriginally developed in context of the detection of adversarial examples, for\nthe automatic detection of synthetic images and the identification of the\naccording generator networks. In contrast to many existing detection\napproaches, which often only work for GAN-generated images, the proposed method\nprovides close to perfect detection results in many realistic use cases.\nExtensive experiments on known and newly created datasets demonstrate that the\nproposed multiLID approach exhibits superiority in diffusion detection and\nmodel identification. 
Since the empirical evaluations of recent publications on\nthe detection of generated images are often mainly focused on the\n\"LSUN-Bedroom\" dataset, we further establish a comprehensive benchmark for the\ndetection of diffusion-generated images, including samples from several\ndiffusion models with different image sizes.\n","authors":["Peter Lorenz","Ricard Durall","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2307.02347v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01091v2","updated":"2023-07-20T09:40:13Z","published":"2023-07-03T15:09:32Z","title":"UW-ProCCaps: UnderWater Progressive Colourisation with Capsules","summary":" Underwater images are fundamental for studying and understanding the status\nof marine life. We focus on reducing the memory space required for image\nstorage while the memory space consumption in the collecting phase limits the\ntime lasting of this phase leading to the need for more image collection\ncampaigns. We present a novel machine-learning model that reconstructs the\ncolours of underwater images from their luminescence channel, thus saving 2/3\nof the available storage space. Our model specialises in underwater colour\nreconstruction and consists of an encoder-decoder architecture. The encoder is\ncomposed of a convolutional encoder and a parallel specialised classifier\ntrained with webly-supervised data. The encoder and the decoder use layers of\ncapsules to capture the features of the entities in the image. The colour\nreconstruction process recalls the progressive and the generative adversarial\ntraining procedures. The progressive training gives the ground for a generative\nadversarial routine focused on the refining of colours giving the image bright\nand saturated colours which bring the image back to life. We validate the model\nboth qualitatively and quantitatively on four benchmark datasets. This is the\nfirst attempt at colour reconstruction in greyscale underwater images.\nExtensive results on four benchmark datasets demonstrate that our solution\noutperforms state-of-the-art (SOTA) solutions. We also demonstrate that the\ngenerated colourisation enhances the quality of images compared to enhancement\nmodels at the SOTA.\n","authors":["Rita Pucci","Niki Martinel"],"pdf_url":"https://arxiv.org/pdf/2307.01091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10713v1","updated":"2023-07-20T09:13:32Z","published":"2023-07-20T09:13:32Z","title":"Kick Back & Relax: Learning to Reconstruct the World by Watching SlowTV","summary":" Self-supervised monocular depth estimation (SS-MDE) has the potential to\nscale to vast quantities of data. Unfortunately, existing approaches limit\nthemselves to the automotive domain, resulting in models incapable of\ngeneralizing to complex environments such as natural or indoor settings.\n To address this, we propose a large-scale SlowTV dataset curated from\nYouTube, containing an order of magnitude more data than existing automotive\ndatasets. SlowTV contains 1.7M images from a rich diversity of environments,\nsuch as worldwide seasonal hiking, scenic driving and scuba diving. Using this\ndataset, we train an SS-MDE model that provides zero-shot generalization to a\nlarge collection of indoor/outdoor datasets. The resulting model outperforms\nall existing SSL approaches and closes the gap on supervised SoTA, despite\nusing a more efficient architecture.\n We additionally introduce a collection of best-practices to further maximize\nperformance and zero-shot generalization. 
This includes 1) aspect ratio\naugmentation, 2) camera intrinsic estimation, 3) support frame randomization\nand 4) flexible motion estimation. Code is available at\nhttps://github.com/jspenmar/slowtv_monodepth.\n","authors":["Jaime Spencer","Chris Russell","Simon Hadfield","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2307.10713v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.10711v1","updated":"2023-07-20T09:06:21Z","published":"2023-07-20T09:06:21Z","title":"AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of\n Diffusion Probabilistic Models","summary":" Existing customization methods require access to multiple reference examples\nto align pre-trained diffusion probabilistic models (DPMs) with user-provided\nconcepts. This paper aims to address the challenge of DPM customization when\nthe only available supervision is a differentiable metric defined on the\ngenerated contents. Since the sampling procedure of DPMs involves recursive\ncalls to the denoising UNet, na\\\"ive gradient backpropagation requires storing\nthe intermediate states of all iterations, resulting in extremely high memory\nconsumption. To overcome this issue, we propose a novel method AdjointDPM,\nwhich first generates new samples from diffusion models by solving the\ncorresponding probability-flow ODEs. It then uses the adjoint sensitivity\nmethod to backpropagate the gradients of the loss to the models' parameters\n(including conditioning signals, network weights, and initial noises) by\nsolving another augmented ODE. To reduce numerical errors in both the forward\ngeneration and gradient backpropagation processes, we further reparameterize\nthe probability-flow ODE and augmented ODE as simple non-stiff ODEs using\nexponential integration. Finally, we demonstrate the effectiveness of\nAdjointDPM on three interesting tasks: converting visual effects into\nidentification text embeddings, finetuning DPMs for specific types of\nstylization, and optimizing initial noise to generate adversarial samples for\nsecurity auditing.\n","authors":["Jiachun Pan","Hanshu Yan","Jun Hao Liew","Vincent Y. F. Tan","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2307.10711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10552v2","updated":"2023-07-20T08:57:20Z","published":"2022-06-21T17:33:53Z","title":"Vicinity Vision Transformer","summary":" Vision transformers have shown great success on numerous computer vision\ntasks. However, its central component, softmax attention, prohibits vision\ntransformers from scaling up to high-resolution images, due to both the\ncomputational complexity and memory footprint being quadratic. Although linear\nattention was introduced in natural language processing (NLP) tasks to mitigate\na similar issue, directly applying existing linear attention to vision\ntransformers may not lead to satisfactory results. We investigate this problem\nand find that computer vision tasks focus more on local information compared\nwith NLP tasks. Based on this observation, we present a Vicinity Attention that\nintroduces a locality bias to vision transformers with linear complexity.\nSpecifically, for each image patch, we adjust its attention weight based on its\n2D Manhattan distance measured by its neighbouring patches. 
In this case, the\nneighbouring patches will receive stronger attention than far-away patches.\nMoreover, since our Vicinity Attention requires the token length to be much\nlarger than the feature dimension to show its efficiency advantages, we further\npropose a new Vicinity Vision Transformer (VVT) structure to reduce the feature\ndimension without degenerating the accuracy. We perform extensive experiments\non the CIFAR100, ImageNet1K, and ADE20K datasets to validate the effectiveness\nof our method. Our method has a slower growth rate of GFlops than previous\ntransformer-based and convolution-based networks when the input resolution\nincreases. In particular, our approach achieves state-of-the-art image\nclassification accuracy with 50% fewer parameters than previous methods.\n","authors":["Weixuan Sun","Zhen Qin","Hui Deng","Jianyuan Wang","Yi Zhang","Kaihao Zhang","Nick Barnes","Stan Birchfield","Lingpeng Kong","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2206.10552v2.pdf","comment":"code: https://github.com/OpenNLPLab/Vicinity-Vision-Transformer"},{"id":"http://arxiv.org/abs/2307.10705v1","updated":"2023-07-20T08:53:47Z","published":"2023-07-20T08:53:47Z","title":"TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and\n Lane Segmentation in Self-Driving Cars","summary":" Semantic segmentation is a common task in autonomous driving to understand\nthe surrounding environment. Driveable Area Segmentation and Lane Detection are\nparticularly important for safe and efficient navigation on the road. However,\noriginal semantic segmentation models are computationally expensive and require\nhigh-end hardware, which is not feasible for embedded systems in autonomous\nvehicles. This paper proposes a lightweight model for the driveable area and\nlane line segmentation. TwinLiteNet is designed cheaply but achieves accurate\nand efficient segmentation results. We evaluate TwinLiteNet on the BDD100K\ndataset and compare it with modern models. Experimental results show that our\nTwinLiteNet performs similarly to existing approaches, requiring significantly\nfewer computational resources. Specifically, TwinLiteNet achieves a mIoU score\nof 91.3% for the Drivable Area task and 31.08% IoU for the Lane Detection task\nwith only 0.4 million parameters and achieves 415 FPS on GPU RTX A5000.\nFurthermore, TwinLiteNet can run in real-time on embedded devices with limited\ncomputing power, especially since it achieves 60FPS on Jetson Xavier NX, making\nit an ideal solution for self-driving vehicles. Code is available:\nurl{https://github.com/chequanghuy/TwinLiteNet}.\n","authors":["Quang Huy Che","Dinh Phuc Nguyen","Minh Quan Pham","Duc Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2307.10705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10165v2","updated":"2023-07-20T08:53:13Z","published":"2023-07-19T17:46:55Z","title":"Drone navigation and license place detection for vehicle location in\n indoor spaces","summary":" Millions of vehicles are transported every year, tightly parked in vessels or\nboats. To reduce the risks of associated safety issues like fires, knowing the\nlocation of vehicles is essential, since different vehicles may need different\nmitigation measures, e.g. electric cars. This work is aimed at creating a\nsolution based on a nano-drone that navigates across rows of parked vehicles\nand detects their license plates. We do so via a wall-following algorithm, and\na CNN trained to detect license plates. 
All computations are done in real-time\non the drone, which just sends position and detected images that allow the\ncreation of a 2D map with the position of the plates. Our solution is capable\nof reading all plates across eight test cases (with several rows of plates,\ndifferent drone speeds, or low light) by aggregation of measurements across\nseveral drone journeys.\n","authors":["Moa Arvidsson","Sithichot Sawirot","Cristofer Englund","Fernando Alonso-Fernandez","Martin Torstensson","Boris Duran"],"pdf_url":"https://arxiv.org/pdf/2307.10165v2.pdf","comment":"Published at VIII International Workshop on Artificial Intelligence\n and Pattern Recognition, IWAIPR 2023"},{"id":"http://arxiv.org/abs/2205.09753v2","updated":"2023-07-20T08:41:46Z","published":"2022-04-30T07:08:30Z","title":"HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory\n Prediction via Scene Encoding","summary":" Encoding a driving scene into vector representations has been an essential\ntask for autonomous driving that can benefit downstream tasks, e.g., trajectory\nprediction. The driving scene often involves heterogeneous elements, such as\ndifferent types of objects (agents, lanes, traffic signs), and the semantic\nrelations between objects are rich and diverse. Meanwhile, spatial relations\nacross elements are relative, which means that they need to be encoded in an\nego-centric manner instead of in a global coordinate system. Based on these\nobservations, we propose the Heterogeneous\nDriving Graph Transformer (HDGT), a backbone modelling the driving scene as a\nheterogeneous graph with different types of nodes and edges. For heterogeneous\ngraph construction, we connect different types of nodes according to diverse\nsemantic relations. For spatial relation encoding, the coordinates of each node\nas well as its in-edges are expressed in the local node-centric coordinate\nsystem. For the aggregation module in the graph neural network (GNN), we adopt\nthe transformer structure in a hierarchical way to fit the heterogeneous nature\nof the inputs. Experimental results show that HDGT achieves state-of-the-art\nperformance for the task of trajectory prediction on the INTERACTION Prediction\nChallenge and the Waymo Open Motion Challenge.\n","authors":["Xiaosong Jia","Penghao Wu","Li Chen","Yu Liu","Hongyang Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2205.09753v2.pdf","comment":"Accepted at IEEE TPAMI in 2023. Code url:\n https://github.com/OpenDriveLab/HDGT"},{"id":"http://arxiv.org/abs/2307.10698v1","updated":"2023-07-20T08:39:20Z","published":"2023-07-20T08:39:20Z","title":"Reverse Knowledge Distillation: Training a Large Model using a Small One\n for Retinal Image Matching on Limited Data","summary":" Retinal image matching plays a crucial role in monitoring disease progression\nand treatment response. However, datasets with matched keypoints between\ntemporally separated pairs of images are not available in abundance to train\ntransformer-based models. We propose a novel approach based on reverse\nknowledge distillation to train large models with limited data while preventing\noverfitting. Firstly, we propose architectural modifications to a CNN-based\nsemi-supervised method called SuperRetina that help us improve its results on a\npublicly available dataset. 
Then, we train a computationally heavier model\nbased on a vision transformer encoder using the lighter CNN-based model, which\nis counter-intuitive in the field of knowledge-distillation research, where\ntraining lighter models based on heavier ones is the norm. Surprisingly, such\nreverse knowledge distillation improves generalization even further. Our\nexperiments suggest that high-dimensional fitting in representation space may\nprevent overfitting, unlike training directly to match the final output. We\nalso provide a public dataset with annotations for retinal image keypoint\ndetection and matching to help the research community develop algorithms for\nretinal image applications.\n","authors":["Sahar Almahfouz Nasser","Nihar Gupte","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2307.10698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10697v1","updated":"2023-07-20T08:38:50Z","published":"2023-07-20T08:38:50Z","title":"SqueezerFaceNet: Reducing a Small Face Recognition CNN Even More Via\n Filter Pruning","summary":" The widespread use of mobile devices for various digital services has created\na need for reliable and real-time person authentication. In this context,\nfacial recognition technologies have emerged as a dependable method for\nverifying users due to the prevalence of cameras in mobile devices and their\nintegration into everyday applications. The rapid advancement of deep\nConvolutional Neural Networks (CNNs) has led to numerous face verification\narchitectures. However, these models are often large and impractical for mobile\napplications, reaching sizes of hundreds of megabytes with millions of\nparameters. We address this issue by developing SqueezerFaceNet, a light face\nrecognition network with less than 1M parameters. This is achieved by applying\na network pruning method based on Taylor scores, where filters with small\nimportance scores are removed iteratively. Starting from an already small\nnetwork (of 1.24M parameters) based on SqueezeNet, we show that it can be\nfurther reduced (by up to 40%) without an appreciable loss in performance. To\nthe best of our knowledge, we are the first to evaluate network pruning methods\nfor the task of face recognition.\n","authors":["Fernando Alonso-Fernandez","Kevin Hernandez-Diaz","Jose Maria Buades Rubio","Josef Bigun"],"pdf_url":"https://arxiv.org/pdf/2307.10697v1.pdf","comment":"Published at VIII International Workshop on Artificial Intelligence\n and Pattern Recognition, IWAIPR 2023"},{"id":"http://arxiv.org/abs/2307.10696v1","updated":"2023-07-20T08:38:15Z","published":"2023-07-20T08:38:15Z","title":"SLPD: Slide-level Prototypical Distillation for WSIs","summary":" Improving the feature representation ability is the foundation of many whole\nslide pathological image (WSI) tasks. Recent works have achieved great success\nin pathology-specific self-supervised learning (SSL). However, most of them\nonly focus on learning patch-level representations, and thus there is still a\ngap between pretext and slide-level downstream tasks, e.g., subtyping, grading\nand staging. Aiming towards slide-level representations, we propose Slide-Level\nPrototypical Distillation (SLPD) to explore intra- and inter-slide semantic\nstructures for context modeling on WSIs. Specifically, we iteratively perform\nintra-slide clustering for the regions (4096x4096 patches) within each WSI to\nyield the prototypes and encourage the region representations to be closer to\nthe assigned prototypes. 
By representing each slide with its prototypes, we\nfurther select similar slides by the set distance of prototypes and assign the\nregions by cross-slide prototypes for distillation. SLPD achieves\nstate-of-the-art results on multiple slide-level benchmarks and demonstrates\nthat representation learning of semantic structures of slides can make a\nsuitable proxy task for WSI analysis. Code will be available at\nhttps://github.com/Carboxy/SLPD.\n","authors":["Zhimiao Yu","Tiancheng Lin","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2307.10696v1.pdf","comment":"International Conference on Medical Image Computing and Computer\n Assisted Intervention (MICCAI)"},{"id":"http://arxiv.org/abs/2307.10695v1","updated":"2023-07-20T08:38:01Z","published":"2023-07-20T08:38:01Z","title":"Self2Self+: Single-Image Denoising with Self-Supervised Learning and\n Image Quality Assessment Loss","summary":" Recently, denoising methods based on supervised learning have exhibited\npromising performance. However, their reliance on external datasets containing\nnoisy-clean image pairs restricts their applicability. To address this\nlimitation, researchers have focused on training denoising networks using\nsolely a set of noisy inputs. To improve the feasibility of denoising\nprocedures, in this study, we proposed a single-image self-supervised learning\nmethod in which only the noisy input image is used for network training. Gated\nconvolution was used for feature extraction and no-reference image quality\nassessment was used for guiding the training process. Moreover, the proposed\nmethod sampled instances from the input image dataset using Bernoulli sampling\nwith a certain dropout rate for training. The corresponding result was produced\nby averaging the generated predictions from various instances of the trained\nnetwork with dropouts. The experimental results indicated that the proposed\nmethod achieved state-of-the-art denoising performance on both synthetic and\nreal-world datasets. This highlights the effectiveness and practicality of our\nmethod as a potential solution for various noise removal tasks.\n","authors":["Jaekyun Ko","Sanghwan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10695v1.pdf","comment":"Technical report and supplemantry materials are combined into one\n paper. - Technical report: Page 1~7 - Supplemantry materials : Page 8~18"},{"id":"http://arxiv.org/abs/2302.08292v3","updated":"2023-07-20T08:35:26Z","published":"2023-02-16T13:41:19Z","title":"Navya3DSeg -- Navya 3D Semantic Segmentation Dataset & split generation\n for autonomous vehicles","summary":" Autonomous driving (AD) perception today relies heavily on deep learning\nbased architectures requiring large scale annotated datasets with their\nassociated costs for curation and annotation. The 3D semantic data are useful\nfor core perception tasks such as obstacle detection and ego-vehicle\nlocalization. We propose a new dataset, Navya 3D Segmentation (Navya3DSeg),\nwith a diverse label space corresponding to a large scale production grade\noperational domain, including rural, urban, industrial sites and universities\nfrom 13 countries. It contains 23 labeled sequences and 25 supplementary\nsequences without labels, designed to explore self-supervised and\nsemi-supervised semantic segmentation benchmarks on point clouds. 
We also\npropose a novel method for sequential dataset split generation based on\niterative multi-label stratification, and demonstrated to achieve a +1.2% mIoU\nimprovement over the original split proposed by SemanticKITTI dataset. A\ncomplete benchmark for semantic segmentation task was performed, with state of\nthe art methods. Finally, we demonstrate an Active Learning (AL) based dataset\ndistillation framework. We introduce a novel heuristic-free sampling method\ncalled ego-pose distance based sampling in the context of AL. A detailed\npresentation on the dataset is available here\nhttps://www.youtube.com/watch?v=5m6ALIs-s20.\n","authors":["Alexandre Almin","Léo Lemarié","Anh Duong","B Ravi Kiran"],"pdf_url":"https://arxiv.org/pdf/2302.08292v3.pdf","comment":"Accepted version to IEEE RA-L. Version with supplementary materials"},{"id":"http://arxiv.org/abs/2307.10685v1","updated":"2023-07-20T08:25:38Z","published":"2023-07-20T08:25:38Z","title":"Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged\n Object Detection","summary":" Camouflaged object detection (COD), aiming to segment camouflaged objects\nwhich exhibit similar patterns with the background, is a challenging task. Most\nexisting works are dedicated to establishing specialized modules to identify\ncamouflaged objects with complete and fine details, while the boundary can not\nbe well located for the lack of object-related semantics. In this paper, we\npropose a novel ``pre-train, adapt and detect\" paradigm to detect camouflaged\nobjects. By introducing a large pre-trained model, abundant knowledge learned\nfrom massive multi-modal data can be directly transferred to COD. A lightweight\nparallel adapter is inserted to adjust the features suitable for the downstream\nCOD task. Extensive experiments on four challenging benchmark datasets\ndemonstrate that our method outperforms existing state-of-the-art COD models by\nlarge margins. Moreover, we design a multi-task learning scheme for tuning the\nadapter to exploit the shareable knowledge across different semantic classes.\nComprehensive experimental results showed that the generalization ability of\nour model can be substantially improved with multi-task adapter initialization\non source tasks and multi-task adaptation on target tasks.\n","authors":["Yinghui Xing","Dexuan Kong","Shizhou Zhang","Geng Chen","Lingyan Ran","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12112v3","updated":"2023-07-20T08:16:09Z","published":"2023-03-21T18:03:14Z","title":"Positive-Augmented Contrastive Learning for Image and Video Captioning\n Evaluation","summary":" The CLIP model has been recently proven to be very effective for a variety of\ncross-modal tasks, including the evaluation of captions generated from\nvision-and-language architectures. In this paper, we propose a new recipe for a\ncontrastive-based evaluation metric for image captioning, namely\nPositive-Augmented Contrastive learning Score (PAC-S), that in a novel way\nunifies the learning of a contrastive visual-semantic space with the addition\nof generated images and text on curated data. Experiments spanning several\ndatasets demonstrate that our new metric achieves the highest correlation with\nhuman judgments on both images and videos, outperforming existing\nreference-based metrics like CIDEr and SPICE and reference-free metrics like\nCLIP-Score. 
Finally, we test the system-level correlation of the proposed\nmetric when considering popular image captioning approaches, and assess the\nimpact of employing different cross-modal features. Our source code and trained\nmodels are publicly available at: https://github.com/aimagelab/pacscore.\n","authors":["Sara Sarto","Manuele Barraco","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2303.12112v3.pdf","comment":"CVPR 2023 (highlight paper)"},{"id":"http://arxiv.org/abs/2307.05921v3","updated":"2023-07-20T08:14:17Z","published":"2023-07-12T05:36:47Z","title":"Reading Radiology Imaging Like The Radiologist","summary":" Automated radiology report generation aims to generate radiology reports that\ncontain rich, fine-grained descriptions of radiology imaging. Compared with\nimage captioning in the natural image domain, medical images are very similar\nto each other, with only minor differences in the occurrence of diseases. Given\nthe importance of these minor differences in the radiology report, it is\ncrucial to encourage the model to focus more on the subtle regions of disease\noccurrence. Secondly, the problem of visual and textual data biases is serious.\nNot only do normal cases make up the majority of the dataset, but sentences\ndescribing areas with pathological changes also constitute only a small part of\nthe paragraph. Lastly, generating medical image reports involves the challenge\nof long text generation, which requires more expertise and empirical training\nin medical knowledge. As a result, the difficulty of generating such reports is\nincreased. To address these challenges, we propose a disease-oriented retrieval\nframework that utilizes similar reports as prior knowledge references. We\ndesign a factual consistency captioning generator to generate more accurate and\nfactually consistent disease descriptions. Our framework can find most similar\nreports for a given disease from the CXR database by retrieving a\ndisease-oriented mask consisting of the position and morphological\ncharacteristics. By referencing the disease-oriented similar report and the\nvisual features, the factual consistency model can generate a more accurate\nradiology report.\n","authors":["Yuhao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.05921v3.pdf","comment":"There are data writing errors in the paper"},{"id":"http://arxiv.org/abs/2307.10677v1","updated":"2023-07-20T07:57:14Z","published":"2023-07-20T07:57:14Z","title":"Deep learning for classification of noisy QR codes","summary":" We wish to define the limits of a classical classification model based on\ndeep learning when applied to abstract images, which do not represent visually\nidentifiable objects.QR codes (Quick Response codes) fall into this category of\nabstract images: one bit corresponding to one encoded character, QR codes were\nnot designed to be decoded manually. To understand the limitations of a deep\nlearning-based model for abstract image classification, we train an image\nclassification model on QR codes generated from information obtained when\nreading a health pass. We compare a classification model with a classical\n(deterministic) decoding method in the presence of noise. This study allows us\nto conclude that a model based on deep learning can be relevant for the\nunderstanding of abstract images.\n","authors":["Rebecca Leygonie","Sylvain Lobry"," )","Laurent Wendling (LIPADE)"],"pdf_url":"https://arxiv.org/pdf/2307.10677v1.pdf","comment":"in French language. 
RFIAP 2022 - Reconnaissance des Formes, Image,\n Apprentissage et Perception, Jul 2022, Vannes (Bretagne), France"},{"id":"http://arxiv.org/abs/2307.10667v1","updated":"2023-07-20T07:47:48Z","published":"2023-07-20T07:47:48Z","title":"Efficient Unified Demosaicing for Bayer and Non-Bayer Patterned Image\n Sensors","summary":" As the physical size of recent CMOS image sensors (CIS) gets smaller, the\nlatest mobile cameras are adopting unique non-Bayer color filter array (CFA)\npatterns (e.g., Quad, Nona, QxQ), which consist of homogeneous color units with\nadjacent pixels. These non-Bayer sensors are superior to conventional Bayer CFA\nthanks to their changeable pixel-bin sizes for different light conditions but\nmay introduce visual artifacts during demosaicing due to their inherent pixel\npattern structures and sensor hardware characteristics. Previous demosaicing\nmethods have primarily focused on Bayer CFA, necessitating distinct\nreconstruction methods for non-Bayer patterned CIS with various CFA modes under\ndifferent lighting conditions. In this work, we propose an efficient unified\ndemosaicing method that can be applied to both conventional Bayer RAW and\nvarious non-Bayer CFAs' RAW data in different operation modes. Our Knowledge\nLearning-based demosaicing model for Adaptive Patterns, namely KLAP, utilizes\nCFA-adaptive filters for only 1% key filters in the network for each CFA, but\nstill manages to effectively demosaic all the CFAs, yielding comparable\nperformance to the large-scale models. Furthermore, by employing meta-learning\nduring inference (KLAP-M), our model is able to eliminate unknown\nsensor-generic artifacts in real RAW data, effectively bridging the gap between\nsynthetic images and real sensor RAW. Our KLAP and KLAP-M methods achieved\nstate-of-the-art demosaicing performance in both synthetic and real RAW data of\nBayer and non-Bayer CFAs.\n","authors":["Haechang Lee","Dongwon Park","Wongi Jeong","Kijeong Kim","Hyunwoo Je","Dongil Ryu","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2307.10667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10664v1","updated":"2023-07-20T07:46:34Z","published":"2023-07-20T07:46:34Z","title":"Lighting up NeRF via Unsupervised Decomposition and Enhancement","summary":" Neural Radiance Field (NeRF) is a promising approach for synthesizing novel\nviews, given a set of images and the corresponding camera poses of a scene.\nHowever, images photographed from a low-light scene can hardly be used to train\na NeRF model to produce high-quality results, due to their low pixel\nintensities, heavy noise, and color distortion. Combining existing low-light\nimage enhancement methods with NeRF methods also does not work well due to the\nview inconsistency caused by the individual 2D enhancement process. In this\npaper, we propose a novel approach, called Low-Light NeRF (or LLNeRF), to\nenhance the scene representation and synthesize normal-light novel views\ndirectly from sRGB low-light images in an unsupervised manner. The core of our\napproach is a decomposition of radiance field learning, which allows us to\nenhance the illumination, reduce noise and correct the distorted colors jointly\nwith the NeRF optimization process. Our method is able to produce novel view\nimages with proper lighting and vivid colors and details, given a collection of\ncamera-finished low dynamic range (8-bits/channel) images from a low-light\nscene. 
Experiments demonstrate that our method outperforms existing low-light\nenhancement methods and NeRF methods.\n","authors":["Haoyuan Wang","Xiaogang Xu","Ke Xu","Rynson WH. Lau"],"pdf_url":"https://arxiv.org/pdf/2307.10664v1.pdf","comment":"ICCV 2023. Project website: https://whyy.site/paper/llnerf"},{"id":"http://arxiv.org/abs/2306.16997v2","updated":"2023-07-20T07:29:03Z","published":"2023-06-29T14:54:10Z","title":"Unsupervised 3D registration through optimization-guided cyclical\n self-training","summary":" State-of-the-art deep learning-based registration methods employ three\ndifferent learning strategies: supervised learning, which requires costly\nmanual annotations, unsupervised learning, which heavily relies on hand-crafted\nsimilarity metrics designed by domain experts, or learning from synthetic data,\nwhich introduces a domain shift. To overcome the limitations of these\nstrategies, we propose a novel self-supervised learning paradigm for\nunsupervised registration, relying on self-training. Our idea is based on two\nkey insights. Feature-based differentiable optimizers 1) perform reasonable\nregistration even from random features and 2) stabilize the training of the\npreceding feature extraction network on noisy labels. Consequently, we propose\ncyclical self-training, where pseudo labels are initialized as the displacement\nfields inferred from random features and cyclically updated based on more and\nmore expressive features from the learning feature extractor, yielding a\nself-reinforcement effect. We evaluate the method for abdomen and lung\nregistration, consistently surpassing metric-based supervision and\noutperforming diverse state-of-the-art competitors. Source code is available at\nhttps://github.com/multimodallearning/reg-cyclical-self-train.\n","authors":["Alexander Bigalke","Lasse Hansen","Tony C. W. Mok","Mattias P. Heinrich"],"pdf_url":"https://arxiv.org/pdf/2306.16997v2.pdf","comment":"accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.10642v1","updated":"2023-07-20T07:12:56Z","published":"2023-07-20T07:12:56Z","title":"RetouchingFFHQ: A Large-scale Dataset for Fine-grained Face Retouching\n Detection","summary":" The widespread use of face retouching filters on short-video platforms has\nraised concerns about the authenticity of digital appearances and the impact of\ndeceptive advertising. To address these issues, there is a pressing need to\ndevelop advanced face retouching techniques. However, the lack of large-scale\nand fine-grained face retouching datasets has been a major obstacle to progress\nin this field. In this paper, we introduce RetouchingFFHQ, a large-scale and\nfine-grained face retouching dataset that contains over half a million\nconditionally-retouched images. RetouchingFFHQ stands out from previous\ndatasets due to its large scale, high quality, fine-grainedness, and\ncustomization. By including four typical types of face retouching operations\nand different retouching levels, we extend the binary face retouching detection\ninto a fine-grained, multi-retouching type, and multi-retouching level\nestimation problem. Additionally, we propose a Multi-granularity Attention\nModule (MAM) as a plugin for CNN backbones for enhanced cross-scale\nrepresentation learning. Extensive experiments using different baselines as\nwell as our proposed method on RetouchingFFHQ show decent performance on face\nretouching detection. 
With the proposed new dataset, we believe there is great\npotential for future work to tackle the challenging problem of real-world\nfine-grained face retouching detection.\n","authors":["Qichao Ying","Jiaxin Liu","Sheng Li","Haisheng Xu","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10642v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.10638v1","updated":"2023-07-20T07:08:24Z","published":"2023-07-20T07:08:24Z","title":"Quantized Feature Distillation for Network Quantization","summary":" Neural network quantization aims to accelerate and trim full-precision neural\nnetwork models by using low bit approximations. Methods adopting the\nquantization aware training (QAT) paradigm have recently seen a rapid growth,\nbut are often conceptually complicated. This paper proposes a novel and highly\neffective QAT method, quantized feature distillation (QFD). QFD first trains a\nquantized (or binarized) representation as the teacher, then quantize the\nnetwork using knowledge distillation (KD). Quantitative results show that QFD\nis more flexible and effective (i.e., quantization friendly) than previous\nquantization methods. QFD surpasses existing methods by a noticeable margin on\nnot only image classification but also object detection, albeit being much\nsimpler. Furthermore, QFD quantizes ViT and Swin-Transformer on MS-COCO\ndetection and segmentation, which verifies its potential in real world\ndeployment. To the best of our knowledge, this is the first time that vision\ntransformers have been quantized in object detection and image segmentation\ntasks.\n","authors":["Ke Zhu","Yin-Yin He","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10638v1.pdf","comment":"AAAI2023"},{"id":"http://arxiv.org/abs/2305.08396v3","updated":"2023-07-20T07:06:03Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" Convolutional Neural Networks (CNNs) have made significant strides in medical\nimage analysis in recent years. However, the local nature of the convolution\noperator may pose a limitation for capturing global and long-range interactions\nin CNNs. Recently, Transformers have gained popularity in the computer vision\ncommunity and also medical image segmentation due to their ability to process\nglobal features effectively. The scalability issues of self-attention mechanism\nand lack of the CNN-like inductive bias may have limited their adoption.\nTherefore, hybrid Vision transformers (CNN-Transformer), exploiting advantages\nof both Convolution and Self-attention Mechanisms, have gained importance. In\nthis work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with\nnominal computational burden. The inclusion of multi-axis self-attention,\nwithin each decoder stage, significantly enhances the discriminating capacity\nbetween the object and background regions, and thereby helps in improving the\nsegmentation efficiency. In the Hybrid Decoder block, the fusion process\ncommences by integrating the upsampled lower level decoder features, obtained\nthrough transpose convolution, with the skip-connection features derived from\nthe hybrid encoder. Subsequently, the fused features undergo refinement through\nthe utilization of a multi-axis attention mechanism. 
The proposed decoder block\nis repeated multiple times to progressively segment the nuclei regions.\nExperimental results on MoNuSeg18 and MoNuSAC20 dataset demonstrates the\neffectiveness of the proposed technique.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10636v1","updated":"2023-07-20T07:04:16Z","published":"2023-07-20T07:04:16Z","title":"Learning and Evaluating Human Preferences for Conversational Head\n Generation","summary":" A reliable and comprehensive evaluation metric that aligns with manual\npreference assessments is crucial for conversational head video synthesis\nmethod development. Existing quantitative evaluations often fail to capture the\nfull complexity of human preference, as they only consider limited evaluation\ndimensions. Qualitative evaluations and user studies offer a solution but are\ntime-consuming and labor-intensive. This limitation hinders the advancement of\nconversational head generation algorithms and systems. In this paper, we\npropose a novel learning-based evaluation metric named Preference Score (PS)\nfor fitting human preference according to the quantitative evaluations across\ndifferent dimensions. PS can serve as a quantitative evaluation without the\nneed for human annotation. Experimental results validate the superiority of\nPreference Score in aligning with human perception, and also demonstrates\nrobustness and generalizability to unseen data, making it a valuable tool for\nadvancing conversation head generation. We expect this metric could facilitate\nnew advances in conversational head generation.\n","authors":["Mohan Zhou","Yalong Bai","Wei Zhang","Ting Yao","Tiejun Zhao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2307.10636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12384v2","updated":"2023-07-20T07:04:04Z","published":"2023-03-22T08:47:37Z","title":"RegFormer: An Efficient Projection-Aware Transformer Network for\n Large-Scale Point Cloud Registration","summary":" Although point cloud registration has achieved remarkable advances in\nobject-level and indoor scenes, large-scale registration methods are rarely\nexplored. Challenges mainly arise from the huge point number, complex\ndistribution, and outliers of outdoor LiDAR scans. In addition, most existing\nregistration works generally adopt a two-stage paradigm: They first find\ncorrespondences by extracting discriminative local features, and then leverage\nestimators (eg. RANSAC) to filter outliers, which are highly dependent on\nwell-designed descriptors and post-processing choices. To address these\nproblems, we propose an end-to-end transformer network (RegFormer) for\nlarge-scale point cloud alignment without any further post-processing.\nSpecifically, a projection-aware hierarchical transformer is proposed to\ncapture long-range dependencies and filter outliers by extracting point\nfeatures globally. Our transformer has linear complexity, which guarantees high\nefficiency even for large-scale scenes. Furthermore, to effectively reduce\nmismatches, a bijective association transformer is designed for regressing the\ninitial transformation. 
Extensive experiments on the KITTI and NuScenes datasets\ndemonstrate that our RegFormer achieves competitive performance in terms of\nboth accuracy and efficiency.\n","authors":["Jiuming Liu","Guangming Wang","Zhe Liu","Chaokang Jiang","Marc Pollefeys","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12384v2.pdf","comment":"Accepted by ICCV2023. Codes will be released at\n https://github.com/IRMVLab/RegFormer"},{"id":"http://arxiv.org/abs/2307.10632v1","updated":"2023-07-20T06:58:11Z","published":"2023-07-20T06:58:11Z","title":"Parallelization of a new embedded application for automatic meteor\n detection","summary":" This article presents the methods used to parallelize a new computer vision\napplication. The system is able to automatically detect meteors from\nnon-stabilized cameras and noisy video sequences. The application is designed\nto be embedded in weather balloons or for airborne observation campaigns. Thus,\nthe final target is a low-power system-on-chip (< 10 Watts) while the software\nneeds to compute a stream of frames in real-time (> 25 frames per second). For\nthis, the application is first split into a task graph, and then different\nparallelization techniques are applied. Experimental results demonstrate the\nefficiency of the parallelization methods. For instance, on the Raspberry Pi 4\nand on an HD video sequence, the processing chain reaches 42 frames per second\nwhile consuming only 6 Watts.\n","authors":["Mathuran Kandeepan","Clara Ciocan","Adrien Cassagne","Lionel Lacassagne"],"pdf_url":"https://arxiv.org/pdf/2307.10632v1.pdf","comment":"in French language, COMPAS 2023 - Conf{\\'e}rence francophone\n d'informatique en Parall{\\'e}lisme, Architecture et Syst{\\`e}me, Jul 2023,\n Annecy (France), France"},{"id":"http://arxiv.org/abs/2307.10625v1","updated":"2023-07-20T06:47:46Z","published":"2023-07-20T06:47:46Z","title":"Learning Discriminative Visual-Text Representation for Polyp\n Re-Identification","summary":" Colonoscopic Polyp Re-Identification aims to match a specific polyp in a\nlarge gallery with different cameras and views, which plays a key role in the\nprevention and treatment of colorectal cancer in computer-aided diagnosis.\nHowever, traditional methods mainly focus on visual representation learning,\nwhile neglecting to explore the potential of semantic features during training,\nwhich may easily lead to poor generalization capability when the pretrained\nmodel is adapted to new scenarios. To relieve this dilemma, we\npropose a simple but effective training method named VT-ReID, which can\nremarkably enrich the representation of polyp videos with the interchange of\nhigh-level semantic information. Moreover, we elaborately design a novel\nclustering mechanism to introduce prior knowledge from textual data, which\nleverages contrastive learning to promote better separation from abundant\nunlabeled text data. To the best of our knowledge, this is the first attempt to\nemploy visual-text features with a clustering mechanism for colonoscopic\npolyp re-identification. 
Empirical results show that our method significantly\noutperforms current state-of-the art methods with a clear margin.\n","authors":["Suncheng Xiang","Cang Liu","Sijia Du","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2307.10625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10624v1","updated":"2023-07-20T06:44:42Z","published":"2023-07-20T06:44:42Z","title":"Joint Skeletal and Semantic Embedding Loss for Micro-gesture\n Classification","summary":" In this paper, we briefly introduce the solution of our team HFUT-VUT for the\nMicros-gesture Classification in the MiGA challenge at IJCAI 2023. The\nmicro-gesture classification task aims at recognizing the action category of a\ngiven video based on the skeleton data. For this task, we propose a\n3D-CNNs-based micro-gesture recognition network, which incorporates a skeletal\nand semantic embedding loss to improve action classification performance.\nFinally, we rank 1st in the Micro-gesture Classification Challenge, surpassing\nthe second-place team in terms of Top-1 accuracy by 1.10%.\n","authors":["Kun Li","Dan Guo","Guoliang Chen","Xinge Peng","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10624v1.pdf","comment":"1st Place in Micro-gesture Classification sub-challenge in MiGA at\n IJCAI-2023"},{"id":"http://arxiv.org/abs/2211.14085v3","updated":"2023-07-20T06:42:56Z","published":"2022-11-25T13:14:33Z","title":"Positive unlabeled learning with tensor networks","summary":" Positive unlabeled learning is a binary classification problem with positive\nand unlabeled data. It is common in domains where negative labels are costly or\nimpossible to obtain, e.g., medicine and personalized advertising. Most\napproaches to positive unlabeled learning apply to specific data types (e.g.,\nimages, categorical data) and can not generate new positive and negative\nsamples. This work introduces a feature-space distance-based tensor network\napproach to the positive unlabeled learning problem. The presented method is\nnot domain specific and significantly improves the state-of-the-art results on\nthe MNIST image and 15 categorical/mixed datasets. The trained tensor network\nmodel is also a generative model and enables the generation of new positive and\nnegative instances.\n","authors":["Bojan Žunkovič"],"pdf_url":"https://arxiv.org/pdf/2211.14085v3.pdf","comment":"12 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.10620v1","updated":"2023-07-20T06:37:47Z","published":"2023-07-20T06:37:47Z","title":"Quaternion tensor ring decomposition and application for color image\n inpainting","summary":" In recent years, tensor networks have emerged as powerful tools for solving\nlarge-scale optimization problems. One of the most promising tensor networks is\nthe tensor ring (TR) decomposition, which achieves circular dimensional\npermutation invariance in the model through the utilization of the trace\noperation and equitable treatment of the latent cores. On the other hand, more\nrecently, quaternions have gained significant attention and have been widely\nutilized in color image processing tasks due to their effectiveness in encoding\ncolor pixels. Therefore, in this paper, we propose the quaternion tensor ring\n(QTR) decomposition, which inherits the powerful and generalized representation\nabilities of the TR decomposition while leveraging the advantages of\nquaternions for color pixel representation. 
In addition to providing the\ndefinition of QTR decomposition and an algorithm for learning the QTR format,\nthis paper also proposes a low-rank quaternion tensor completion (LRQTC) model\nand its algorithm for color image inpainting based on the QTR decomposition.\nFinally, extensive experiments on color image inpainting demonstrate that the\nproposed QTLRC method is highly competitive.\n","authors":["Jifei Miao","Kit Ian Kou"],"pdf_url":"https://arxiv.org/pdf/2307.10620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10616v1","updated":"2023-07-20T06:32:14Z","published":"2023-07-20T06:32:14Z","title":"Heterogeneous Federated Learning: State-of-the-art and Research\n Challenges","summary":" Federated learning (FL) has drawn increasing attention owing to its potential\nuse in large-scale industrial applications. Existing federated learning works\nmainly focus on model homogeneous settings. However, practical federated\nlearning typically faces the heterogeneity of data distributions, model\narchitectures, network environments, and hardware devices among participant\nclients. Heterogeneous Federated Learning (HFL) is much more challenging, and\ncorresponding solutions are diverse and complex. Therefore, a systematic survey\non this topic about the research challenges and state-of-the-art is essential.\nIn this survey, we firstly summarize the various research challenges in HFL\nfrom five aspects: statistical heterogeneity, model heterogeneity,\ncommunication heterogeneity, device heterogeneity, and additional challenges.\nIn addition, recent advances in HFL are reviewed and a new taxonomy of existing\nHFL methods is proposed with an in-depth analysis of their pros and cons. We\nclassify existing methods from three different levels according to the HFL\nprocedure: data-level, model-level, and server-level. Finally, several critical\nand promising future research directions in HFL are discussed, which may\nfacilitate further developments in this field. A periodically updated\ncollection on HFL is available at https://github.com/marswhu/HFL_Survey.\n","authors":["Mang Ye","Xiuwen Fang","Bo Du","Pong C. Yuen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.10616v1.pdf","comment":"42 pages, 11 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2307.10609v1","updated":"2023-07-20T06:07:09Z","published":"2023-07-20T06:07:09Z","title":"Hybrid Feature Embedding For Automatic Building Outline Extraction","summary":" Building outline extracted from high-resolution aerial images can be used in\nvarious application fields such as change detection and disaster assessment.\nHowever, traditional CNN model cannot recognize contours very precisely from\noriginal images. In this paper, we proposed a CNN and Transformer based model\ntogether with active contour model to deal with this problem. We also designed\na triple-branch decoder structure to handle different features generated by\nencoder. 
Experiment results show that our model outperforms other baseline\nmodel on two datasets, achieving 91.1% mIoU on Vaihingen and 83.8% on Bing\nhuts.\n","authors":["Weihang Ran","Wei Yuan","Xiaodan Shi","Zipei Fan","Ryosuke Shibasaki"],"pdf_url":"https://arxiv.org/pdf/2307.10609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10603v1","updated":"2023-07-20T05:49:21Z","published":"2023-07-20T05:49:21Z","title":"Physics-Driven Turbulence Image Restoration with Stochastic Refinement","summary":" Image distortion by atmospheric turbulence is a stochastic degradation, which\nis a critical problem in long-range optical imaging systems. A number of\nresearch has been conducted during the past decades, including model-based and\nemerging deep-learning solutions with the help of synthetic data. Although fast\nand physics-grounded simulation tools have been introduced to help the\ndeep-learning models adapt to real-world turbulence conditions recently, the\ntraining of such models only relies on the synthetic data and ground truth\npairs. This paper proposes the Physics-integrated Restoration Network (PiRN) to\nbring the physics-based simulator directly into the training process to help\nthe network to disentangle the stochasticity from the degradation and the\nunderlying image. Furthermore, to overcome the ``average effect\" introduced by\ndeterministic models and the domain gap between the synthetic and real-world\ndegradation, we further introduce PiRN with Stochastic Refinement (PiRN-SR) to\nboost its perceptual quality. Overall, our PiRN and PiRN-SR improve the\ngeneralization to real-world unknown turbulence conditions and provide a\nstate-of-the-art restoration in both pixel-wise accuracy and perceptual\nquality. Our codes are available at \\url{https://github.com/VITA-Group/PiRN}.\n","authors":["Ajay Jaiswal","Xingguang Zhang","Stanley H. Chan","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10603v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10601v1","updated":"2023-07-20T05:46:32Z","published":"2023-07-20T05:46:32Z","title":"SCA-PVNet: Self-and-Cross Attention Based Aggregation of Point Cloud and\n Multi-View for 3D Object Retrieval","summary":" To address 3D object retrieval, substantial efforts have been made to\ngenerate highly discriminative descriptors of 3D objects represented by a\nsingle modality, e.g., voxels, point clouds or multi-view images. It is\npromising to leverage the complementary information from multi-modality\nrepresentations of 3D objects to further improve retrieval performance.\nHowever, multi-modality 3D object retrieval is rarely developed and analyzed on\nlarge-scale datasets. In this paper, we propose self-and-cross attention based\naggregation of point cloud and multi-view images (SCA-PVNet) for 3D object\nretrieval. With deep features extracted from point clouds and multi-view\nimages, we design two types of feature aggregation modules, namely the\nIn-Modality Aggregation Module (IMAM) and the Cross-Modality Aggregation Module\n(CMAM), for effective feature fusion. IMAM leverages a self-attention mechanism\nto aggregate multi-view features while CMAM exploits a cross-attention\nmechanism to interact point cloud features with multi-view features. The final\ndescriptor of a 3D object for object retrieval can be obtained via\nconcatenating the aggregated features from both modules. 
Extensive experiments\nand analysis are conducted on three datasets, ranging from small to large\nscale, to show the superiority of the proposed SCA-PVNet over the\nstate-of-the-art methods.\n","authors":["Dongyun Lin","Yi Cheng","Aiyuan Guo","Shangbo Mao","Yiqun Li"],"pdf_url":"https://arxiv.org/pdf/2307.10601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01928v3","updated":"2023-07-20T05:21:04Z","published":"2023-01-05T06:32:50Z","title":"Event Camera Data Pre-training","summary":" This paper proposes a pre-trained neural network for handling event camera\ndata. Our model is a self-supervised learning framework, and uses paired event\ncamera data and natural RGB images for training.\n Our method contains three modules connected in a sequence: i) a family of\nevent data augmentations, generating meaningful event images for\nself-supervised training; ii) a conditional masking strategy to sample\ninformative event patches from event images, encouraging our model to capture\nthe spatial layout of a scene and accelerating training; iii) a contrastive\nlearning approach, enforcing the similarity of embeddings between matching\nevent images, and between paired event and RGB images. An embedding projection\nloss is proposed to avoid the model collapse when enforcing the event image\nembedding similarities. A probability distribution alignment loss is proposed\nto encourage the event image to be consistent with its paired RGB image in the\nfeature space.\n Transfer learning performance on downstream tasks shows the superiority of\nour method over state-of-the-art methods. For example, we achieve top-1\naccuracy at 64.83% on the N-ImageNet dataset.\n","authors":["Yan Yang","Liyuan Pan","Liu Liu"],"pdf_url":"https://arxiv.org/pdf/2301.01928v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10593v1","updated":"2023-07-20T05:15:03Z","published":"2023-07-20T05:15:03Z","title":"Event Blob Tracking: An Asynchronous Real-Time Algorithm","summary":" Event-based cameras have become increasingly popular for tracking fast-moving\nobjects due to their high temporal resolution, low latency, and high dynamic\nrange. In this paper, we propose a novel algorithm for tracking event blobs\nusing raw events asynchronously in real time. We introduce the concept of an\nevent blob as a spatio-temporal likelihood of event occurrence where the\nconditional spatial likelihood is blob-like. Many real-world objects generate\nevent blob data, for example, flickering LEDs such as car headlights or any\nsmall foreground object moving against a static or slowly varying background.\nThe proposed algorithm uses a nearest neighbour classifier with a dynamic\nthreshold criteria for data association coupled with a Kalman filter to track\nthe event blob state. Our algorithm achieves highly accurate tracking and event\nblob shape estimation even under challenging lighting conditions and high-speed\nmotions. 
The microsecond time resolution achieved means that the filter output\ncan be used to derive secondary information such as time-to-contact or range\nestimation, that will enable applications to real-world problems such as\ncollision avoidance in autonomous driving.\n","authors":["Ziwei Wang","Timothy Molloy","Pieter van Goor","Robert Mahony"],"pdf_url":"https://arxiv.org/pdf/2307.10593v1.pdf","comment":"17 pages, 8 figures, preprint version"},{"id":"http://arxiv.org/abs/2210.06551v4","updated":"2023-07-20T04:59:45Z","published":"2022-10-12T19:46:25Z","title":"MotionBERT: A Unified Perspective on Learning Human Motion\n Representations","summary":" We present a unified perspective on tackling various human-centric video\ntasks by learning human motion representations from large-scale and\nheterogeneous data resources. Specifically, we propose a pretraining stage in\nwhich a motion encoder is trained to recover the underlying 3D motion from\nnoisy partial 2D observations. The motion representations acquired in this way\nincorporate geometric, kinematic, and physical knowledge about human motion,\nwhich can be easily transferred to multiple downstream tasks. We implement the\nmotion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer)\nneural network. It could capture long-range spatio-temporal relationships among\nthe skeletal joints comprehensively and adaptively, exemplified by the lowest\n3D pose estimation error so far when trained from scratch. Furthermore, our\nproposed framework achieves state-of-the-art performance on all three\ndownstream tasks by simply finetuning the pretrained motion encoder with a\nsimple regression head (1-2 layers), which demonstrates the versatility of the\nlearned motion representations. Code and models are available at\nhttps://motionbert.github.io/\n","authors":["Wentao Zhu","Xiaoxuan Ma","Zhaoyang Liu","Libin Liu","Wayne Wu","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2210.06551v4.pdf","comment":"ICCV 2023 version"},{"id":"http://arxiv.org/abs/2307.10584v1","updated":"2023-07-20T04:51:10Z","published":"2023-07-20T04:51:10Z","title":"Reference-based Painterly Inpainting via Diffusion: Crossing the Wild\n Reference Domain Gap","summary":" Have you ever imagined how it would look if we placed new objects into\npaintings? For example, what would it look like if we placed a basketball into\nClaude Monet's ``Water Lilies, Evening Effect''? We propose Reference-based\nPainterly Inpainting, a novel task that crosses the wild reference domain gap\nand implants novel objects into artworks. Although previous works have examined\nreference-based inpainting, they are not designed for large domain\ndiscrepancies between the target and the reference, such as inpainting an\nartistic image using a photorealistic reference. This paper proposes a novel\ndiffusion framework, dubbed RefPaint, to ``inpaint more wildly'' by taking such\nreferences with large domain gaps. Built with an image-conditioned diffusion\nmodel, we introduce a ladder-side branch and a masked fusion mechanism to work\nwith the inpainting mask. By decomposing the CLIP image embeddings at inference\ntime, one can manipulate the strength of semantic and style information with\nease. Experiments demonstrate that our proposed RefPaint framework produces\nsignificantly better results than existing methods. Our method enables creative\npainterly image inpainting with reference objects that would otherwise be\ndifficult to achieve. 
Project page: https://vita-group.github.io/RefPaint/\n","authors":["Dejia Xu","Xingqian Xu","Wenyan Cong","Humphrey Shi","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10577v1","updated":"2023-07-20T04:41:39Z","published":"2023-07-20T04:41:39Z","title":"Ethosight: A Joint-Embedding Based System for Nuanced Perception Using\n Contextual Label Affinity Metric and Reasoning Based Iterative Learning","summary":" Traditional computer vision models often require extensive manual effort for\ndata acquisition and validation, particularly when detecting subtle behavioral\nnuances or events. The difficulty in distinguishing routine behaviors from\npotential risks in real-world applications, like differentiating routine\nshopping from potential shoplifting, further complicates the process.\n We present Ethosight, a novel zero-shot computer vision algorithm. Ethosight\neradicates the need for pre-existing symbolic knowledge, initiating from a\nclean slate based on user requirements and semantic knowledge of interest.\nUsing localized label affinity calculations and a reasoning-guided iterative\nlearning loop, Ethosight infers scene details and iteratively refines the label\nset. Reasoning mechanisms can be derived from large language models like GPT4,\nsymbolic reasoners like OpenNARS, or hybrid systems.\n Ethosight further capitalizes on the capabilities of a pre-trained\nmulti-modal model, ImageBind, generating accurate semantic knowledge of images\nwithin a few cycles. It successfully captures both explicit and nuanced\nelements efficiently. We also introduce the implementation of Korzybski's\n\"time-binding\" concept in machines, which allows for generational learning and\nknowledge sharing across deployments.\n Our evaluations demonstrate Ethosight's efficacy across 40 complex use cases.\nIt has exhibited an exceptional ability to discern new areas of interest,\nconsistently generating high-affinity scores within the top five labels from a\nset of a thousand. Tests conducted across diverse environments attest to\nEthosight's robust performance. Detailed results and case studies within the\nmain body of this paper and an appendix underscore a promising trajectory\ntowards enhancing the adaptability and resilience of computer vision models in\ndetecting and extracting subtle and nuanced behaviors.\n","authors":["Hugo Latapie","Kristinn R. Thorisson","Shan Yu","Vahagn Petrosyan","Patrick Hammer","Pei Wang","Brandon Kynoch","Hanning Chen","Tangrui Li"],"pdf_url":"https://arxiv.org/pdf/2307.10577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10575v1","updated":"2023-07-20T04:35:50Z","published":"2023-07-20T04:35:50Z","title":"Boosting Federated Learning Convergence with Prototype Regularization","summary":" As a distributed machine learning technique, federated learning (FL) requires\nclients to collaboratively train a shared model with an edge server without\nleaking their local data. However, the heterogeneous data distribution among\nclients often leads to a decrease in model performance. To tackle this issue,\nthis paper introduces a prototype-based regularization strategy to address the\nheterogeneity in the data distribution. Specifically, the regularization\nprocess involves the server aggregating local prototypes from distributed\nclients to generate a global prototype, which is then sent back to the\nindividual clients to guide their local training. 
The experimental results on\nMNIST and Fashion-MNIST show that our proposal achieves improvements of 3.3%\nand 8.9% in average test accuracy, respectively, compared to the most popular\nbaseline FedAvg. Furthermore, our approach has a fast convergence rate in\nheterogeneous settings.\n","authors":["Yu Qiao","Huy Q. Le","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2307.10575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04247v2","updated":"2023-07-20T04:28:36Z","published":"2023-05-07T11:18:39Z","title":"Estimation of control area in badminton doubles with pose information\n from top and back view drone videos","summary":" The application of visual tracking to the performance analysis of sports\nplayers in dynamic competitions is vital for effective coaching. In doubles\nmatches, coordinated positioning is crucial for maintaining control of the\ncourt and minimizing opponents' scoring opportunities. The analysis of such\nteamwork plays a vital role in understanding the dynamics of the game. However,\nprevious studies have primarily focused on analyzing and assessing singles\nplayers without considering occlusion in broadcast videos. These studies have\nrelied on discrete representations, which involve the analysis and\nrepresentation of specific actions (e.g., strokes) or events that occur during\nthe game while overlooking the meaningful spatial distribution. In this work,\nwe present the first annotated drone dataset from top and back views in\nbadminton doubles and propose a framework to estimate the control area\nprobability map, which can be used to evaluate teamwork performance. We present\nan efficient framework of deep neural networks that enables the calculation of\nfull probability surfaces. This framework utilizes the embedding of a Gaussian\nmixture map of players' positions and employs graph convolution on their poses.\nIn the experiment, we verify our approach by comparing various baselines and\ndiscovering the correlations between the score and control area. Additionally,\nwe propose a practical application for assessing optimal positioning to provide\ninstructions during a game. Our approach offers both visual and quantitative\nevaluations of players' movements, thereby providing valuable insights into\ndoubles teamwork. The dataset and related project code is available at\nhttps://github.com/Ning-D/Drone_BD_ControlArea\n","authors":["Ning Ding","Kazuya Takeda","Wenhui Jin","Yingjiu Bei","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2305.04247v2.pdf","comment":"15 pages, 10 figures, to appear in Multimedia Tools and Applications"},{"id":"http://arxiv.org/abs/2307.10036v2","updated":"2023-07-20T04:26:46Z","published":"2023-07-19T15:19:02Z","title":"Class Attention to Regions of Lesion for Imbalanced Medical Image\n Recognition","summary":" Automated medical image classification is the key component in intelligent\ndiagnosis systems. However, most medical image datasets contain plenty of\nsamples of common diseases and just a handful of rare ones, leading to major\nclass imbalances. Currently, it is an open problem in intelligent diagnosis to\neffectively learn from imbalanced training data. In this paper, we propose a\nsimple yet effective framework, named \\textbf{C}lass \\textbf{A}ttention to\n\\textbf{RE}gions of the lesion (CARE), to handle data imbalance issues by\nembedding attention into the training process of \\textbf{C}onvolutional\n\\textbf{N}eural \\textbf{N}etworks (CNNs). 
The proposed attention module helps\nCNNs attend to lesion regions of rare diseases, therefore helping CNNs to learn\ntheir characteristics more effectively. In addition, this attention module\nworks only during the training phase and does not change the architecture of\nthe original network, so it can be directly combined with any existing CNN\narchitecture. The CARE framework needs bounding boxes to represent the lesion\nregions of rare diseases. To alleviate the need for manual annotation, we\nfurther developed variants of CARE by leveraging the traditional saliency\nmethods or a pretrained segmentation model for bounding box generation. Results\nshow that the CARE variants with automated bounding box generation are\ncomparable to the original CARE framework with \\textit{manual} bounding box\nannotations. A series of experiments on an imbalanced skin image dataset and a\npneumonia dataset indicates that our method can effectively help the network\nfocus on the lesion regions of rare diseases and remarkably improves the\nclassification performance of rare diseases.\n","authors":["Jia-Xin Zhuang","Jiabin Cai","Jianguo Zhang","Wei-shi Zheng","Ruixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10036v2.pdf","comment":"Accepted by Neurocomputing on July 2023. 37 pages"},{"id":"http://arxiv.org/abs/2307.09724v2","updated":"2023-07-20T04:14:01Z","published":"2023-07-19T02:26:20Z","title":"AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks","summary":" To deliver the artistic expression of the target style, recent studies\nexploit the attention mechanism owing to its ability to map the local patches\nof the style image to the corresponding patches of the content image. However,\nbecause of the low semantic correspondence between arbitrary content and\nartworks, the attention module repeatedly abuses specific local patches from\nthe style image, resulting in disharmonious and evident repetitive artifacts.\nTo overcome this limitation and accomplish impeccable artistic style transfer,\nwe focus on enhancing the attention mechanism and capturing the rhythm of\npatterns that organize the style. In this paper, we introduce a novel metric,\nnamely pattern repeatability, that quantifies the repetition of patterns in the\nstyle image. Based on the pattern repeatability, we propose Aesthetic\nPattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot\nof local and global style expressions. In addition, we propose a novel\nself-supervisory task to encourage the attention mechanism to learn precise and\nmeaningful semantic correspondence. Lastly, we introduce the patch-wise style\nloss to transfer the elaborate rhythm of local patterns. Through qualitative\nand quantitative evaluations, we verify the reliability of the proposed pattern\nrepeatability that aligns with human perception, and demonstrate the\nsuperiority of the proposed framework.\n","authors":["Kibeom Hong","Seogkyu Jeon","Junsoo Lee","Namhyuk Ahn","Kunhee Kim","Pilhyeon Lee","Daesik Kim","Youngjung Uh","Hyeran Byun"],"pdf_url":"https://arxiv.org/pdf/2307.09724v2.pdf","comment":"Accepted by ICCV 2023. Code is available at this\n https://github.com/Kibeom-Hong/AesPA-Net"},{"id":"http://arxiv.org/abs/2307.10567v1","updated":"2023-07-20T04:12:10Z","published":"2023-07-20T04:12:10Z","title":"No-frills Temporal Video Grounding: Multi-Scale Neighboring Attention\n and Zoom-in Boundary Detection","summary":" Temporal video grounding (TVG) aims to retrieve the time interval of a\nlanguage query from an untrimmed video. 
A significant challenge in TVG is the\nlow \"Semantic Noise Ratio (SNR)\", which results in worse performance with lower\nSNR. Prior works have addressed this challenge using sophisticated techniques.\nIn this paper, we propose a no-frills TVG model that consists of two core\nmodules, namely multi-scale neighboring attention and zoom-in boundary\ndetection. The multi-scale neighboring attention restricts each video token to\nonly aggregate visual contexts from its neighbor, enabling the extraction of\nthe most distinguishing information with multi-scale feature hierarchies from\nhigh-ratio noises. The zoom-in boundary detection then focuses on local-wise\ndiscrimination of the selected top candidates for fine-grained grounding\nadjustment. With an end-to-end training strategy, our model achieves\ncompetitive performance on different TVG benchmarks, while also having the\nadvantage of faster inference speed and lighter model parameters, thanks to its\nlightweight architecture.\n","authors":["Qi Zhang","Sipeng Zheng","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2307.10567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14795v2","updated":"2023-07-20T03:39:19Z","published":"2023-06-26T15:53:02Z","title":"MotionGPT: Human Motion as a Foreign Language","summary":" Though the advancement of pre-trained large language models unfolds, the\nexploration of building a unified model for language and other multi-modal\ndata, such as motion, remains challenging and untouched so far. Fortunately,\nhuman motion displays a semantic coupling akin to human language, often\nperceived as a form of body language. By fusing language data with large-scale\nmotion models, motion-language pre-training that can enhance the performance of\nmotion-related tasks becomes feasible. Driven by this insight, we propose\nMotionGPT, a unified, versatile, and user-friendly motion-language model to\nhandle multiple motion-relevant tasks. Specifically, we employ the discrete\nvector quantization for human motion and transfer 3D motion into motion tokens,\nsimilar to the generation process of word tokens. Building upon this \"motion\nvocabulary\", we perform language modeling on both motion and text in a unified\nmanner, treating human motion as a specific language. Moreover, inspired by\nprompt learning, we pre-train MotionGPT with a mixture of motion-language data\nand fine-tune it on prompt-based question-and-answer tasks. Extensive\nexperiments demonstrate that MotionGPT achieves state-of-the-art performances\non multiple motion tasks including text-driven motion generation, motion\ncaptioning, motion prediction, and motion in-between.\n","authors":["Biao Jiang","Xin Chen","Wen Liu","Jingyi Yu","Gang Yu","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.14795v2.pdf","comment":"Project Page: https://github.com/OpenMotionLab/MotionGPT"},{"id":"http://arxiv.org/abs/2307.10554v1","updated":"2023-07-20T03:36:13Z","published":"2023-07-20T03:36:13Z","title":"EMQ: Evolving Training-free Proxies for Automated Mixed Precision\n Quantization","summary":" Mixed-Precision Quantization~(MQ) can achieve a competitive\naccuracy-complexity trade-off for models. Conventional training-based search\nmethods require time-consuming candidate training to search optimized per-layer\nbit-width configurations in MQ. Recently, some training-free approaches have\npresented various MQ proxies and significantly improve search efficiency.\nHowever, the correlation between these proxies and quantization accuracy is\npoorly understood. 
To address the gap, we first build the MQ-Bench-101, which\ninvolves different bit configurations and quantization results. Then, we\nobserve that the existing training-free proxies exhibit weak correlations on\nthe MQ-Bench-101. To efficiently seek superior proxies, we develop an automatic\nsearch of proxies framework for MQ via evolving algorithms. In particular, we\ndevise an elaborate search space involving the existing proxies and perform an\nevolution search to discover the best correlated MQ proxy. We propose a\ndiversity-prompting selection strategy and compatibility screening protocol to\navoid premature convergence and improve search efficiency. In this way, our\nEvolving proxies for Mixed-precision Quantization~(EMQ) framework allows the\nauto-generation of proxies without heavy tuning and expert knowledge. Extensive\nexperiments on ImageNet with various ResNet and MobileNet families demonstrate\nthat our EMQ obtains superior performance compared to state-of-the-art mixed-precision\nmethods at a significantly reduced cost. The code will be released.\n","authors":["Peijie Dong","Lujun Li","Zimian Wei","Xin Niu","Zhiliang Tian","Hengyue Pan"],"pdf_url":"https://arxiv.org/pdf/2307.10554v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.10549v1","updated":"2023-07-20T03:26:57Z","published":"2023-07-20T03:26:57Z","title":"Dynamic Large Language Models on Blockchains","summary":" Training and deploying large language models requires a large amount of\ncomputational resources because the language models contain billions of\nparameters and the text has thousands of tokens. Another problem is that the\nlarge language models are static. They are fixed after the training process. To\ntackle these issues, in this paper, we propose to train and deploy the dynamic\nlarge language model on blockchains, which have high computation performance\nand are distributed across a network of computers. A blockchain is a secure,\ndecentralized, and transparent system that allows for the creation of a\ntamper-proof ledger for transactions without the need for intermediaries. The\ndynamic large language models can continuously learn from the user input after\nthe training process. Our method provides a new way to develop the large\nlanguage models and also sheds light on next-generation artificial\nintelligence systems.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2307.10549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.04550v5","updated":"2023-07-20T02:00:22Z","published":"2021-06-08T17:39:14Z","title":"DETReg: Unsupervised Pretraining with Region Priors for Object Detection","summary":" Recent self-supervised pretraining methods for object detection largely focus\non pretraining the backbone of the object detector, neglecting key parts of\ndetection architecture. Instead, we introduce DETReg, a new self-supervised\nmethod that pretrains the entire object detection network, including the object\nlocalization and embedding components. During pretraining, DETReg predicts\nobject localizations to match the localizations from an unsupervised region\nproposal generator and simultaneously aligns the corresponding feature\nembeddings with embeddings from a self-supervised image encoder. We implement\nDETReg using the DETR family of detectors and show that it improves over\ncompetitive baselines when finetuned on COCO, PASCAL VOC, and Airbus Ship\nbenchmarks. 
In low-data regimes DETReg achieves improved performance, e.g.,\nwhen training with only 1% of the labels and in the few-shot learning settings.\n","authors":["Amir Bar","Xin Wang","Vadim Kantorov","Colorado J Reed","Roei Herzig","Gal Chechik","Anna Rohrbach","Trevor Darrell","Amir Globerson"],"pdf_url":"https://arxiv.org/pdf/2106.04550v5.pdf","comment":"Project page: https://www.amirbar.net/detreg/"},{"id":"http://arxiv.org/abs/2307.10518v1","updated":"2023-07-20T01:37:32Z","published":"2023-07-20T01:37:32Z","title":"Interactive Segmentation for Diverse Gesture Types Without Context","summary":" Interactive segmentation entails a human marking an image to guide how a\nmodel either creates or edits a segmentation. Our work addresses limitations of\nexisting methods: they either only support one gesture type for marking an\nimage (e.g., either clicks or scribbles) or require knowledge of the gesture\ntype being employed, and require specifying whether marked regions should be\nincluded versus excluded in the final segmentation. We instead propose a\nsimplified interactive segmentation task where a user only must mark an image,\nwhere the input can be of any gesture type without specifying the gesture type.\nWe support this new task by introducing the first interactive segmentation\ndataset with multiple gesture types as well as a new evaluation metric capable\nof holistically evaluating interactive segmentation algorithms. We then analyze\nnumerous interactive segmentation algorithms, including ones adapted for our\nnovel task. While we observe promising performance overall, we also highlight\nareas for future improvement. To facilitate further extensions of this work, we\npublicly share our new dataset at https://github.com/joshmyersdean/dig.\n","authors":["Josh Myers-Dean","Yifei Fan","Brian Price","Wilson Chan","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2307.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08015v3","updated":"2023-07-20T01:11:21Z","published":"2023-07-16T11:52:27Z","title":"Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via\n Geometry-Guided Cross-View Transformer","summary":" Image retrieval-based cross-view localization methods often lead to very\ncoarse camera pose estimation, due to the limited sampling density of the\ndatabase satellite images. In this paper, we propose a method to increase the\naccuracy of a ground camera's location and orientation by estimating the\nrelative rotation and translation between the ground-level image and its\nmatched/retrieved satellite image. Our approach designs a geometry-guided\ncross-view transformer that combines the benefits of conventional geometry and\nlearnable cross-view transformers to map the ground-view observations to an\noverhead view. Given the synthesized overhead view and observed satellite\nfeature maps, we construct a neural pose optimizer with strong global\ninformation embedding ability to estimate the relative rotation between them.\nAfter aligning their rotations, we develop an uncertainty-guided spatial\ncorrelation to generate a probability map of the vehicle locations, from which\nthe relative translation can be determined. Experimental results demonstrate\nthat our method significantly outperforms the state-of-the-art. 
Notably, the\nlikelihood of restricting the vehicle lateral pose to be within 1m of its\nGround Truth (GT) value on the cross-view KITTI dataset has been improved from\n$35.54\\%$ to $76.44\\%$, and the likelihood of restricting the vehicle\norientation to be within $1^{\\circ}$ of its GT value has been improved from\n$19.64\\%$ to $99.10\\%$.\n","authors":["Yujiao Shi","Fei Wu","Akhil Perincherry","Ankit Vora","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2307.08015v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2106.04066v6","updated":"2023-07-20T00:24:58Z","published":"2021-06-08T02:51:33Z","title":"Semantically Adversarial Scenario Generation with Explicit Knowledge\n Guidance","summary":" Generating adversarial scenarios, which have the potential to fail autonomous\ndriving systems, provides an effective way to improve robustness. Extending\npurely data-driven generative models, recent specialized models satisfy\nadditional controllable requirements such as embedding a traffic sign in a\ndriving scene by manipulating patterns implicitly in the neuron level. In this\npaper, we introduce a method to incorporate domain knowledge explicitly in the\ngeneration process to achieve the Semantically Adversarial Generation (SAG). To\nbe consistent with the composition of driving scenes, we first categorize the\nknowledge into two types, the property of objects and the relationship among\nobjects. We then propose a tree-structured variational auto-encoder (T-VAE) to\nlearn hierarchical scene representation. By imposing semantic rules on the\nproperties of nodes and edges in the tree structure, explicit knowledge\nintegration enables controllable generation. We construct a synthetic example\nto illustrate the controllability and explainability of our method in a\nsuccinct setting. We further extend to realistic environments for autonomous\nvehicles: our method efficiently identifies adversarial driving scenes against\ndifferent state-of-the-art 3D point cloud segmentation models and satisfies the\ntraffic rules specified as the explicit knowledge.\n","authors":["Wenhao Ding","Haohong Lin","Bo Li","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2106.04066v6.pdf","comment":"20 pages, 13 figures"},{"id":"http://arxiv.org/abs/2307.10507v1","updated":"2023-07-20T00:07:29Z","published":"2023-07-20T00:07:29Z","title":"FedSoup: Improving Generalization and Personalization in Federated\n Learning via Selective Model Interpolation","summary":" Cross-silo federated learning (FL) enables the development of machine\nlearning models on datasets distributed across data centers such as hospitals\nand clinical research laboratories. However, recent research has found that\ncurrent FL algorithms face a trade-off between local and global performance\nwhen confronted with distribution shifts. Specifically, personalized FL methods\nhave a tendency to overfit to local data, leading to a sharp valley in the\nlocal model and inhibiting its ability to generalize to out-of-distribution\ndata. In this paper, we propose a novel federated model soup method (i.e.,\nselective interpolation of model parameters) to optimize the trade-off between\nlocal and global performance. Specifically, during the federated training\nphase, each client maintains its own global model pool by monitoring the\nperformance of the interpolated model between the local and global models. This\nallows us to alleviate overfitting and seek flat minima, which can\nsignificantly improve the model's generalization performance. 
We evaluate our\nmethod on retinal and pathological image classification tasks, and our proposed\nmethod achieves significant improvements for out-of-distribution\ngeneralization. Our code is available at https://github.com/ubc-tea/FedSoup.\n","authors":["Minghui Chen","Meirui Jiang","Qi Dou","Zehua Wang","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2307.10507v1.pdf","comment":"Accepted by MICCAI2023"},{"id":"http://arxiv.org/abs/2307.10506v1","updated":"2023-07-20T00:06:46Z","published":"2023-07-20T00:06:46Z","title":"Is Grad-CAM Explainable in Medical Images?","summary":" Explainable Deep Learning has gained significant attention in the field of\nartificial intelligence (AI), particularly in domains such as medical imaging,\nwhere accurate and interpretable machine learning models are crucial for\neffective diagnosis and treatment planning. Grad-CAM is a baseline that\nhighlights the most critical regions of an image used in a deep learning\nmodel's decision-making process, increasing interpretability and trust in the\nresults. It is applied in many computer vision (CV) tasks such as\nclassification and explanation. This study explores the principles of\nExplainable Deep Learning and its relevance to medical imaging, discusses\nvarious explainability techniques and their limitations, and examines medical\nimaging applications of Grad-CAM. The findings highlight the potential of\nExplainable Deep Learning and Grad-CAM in improving the accuracy and\ninterpretability of deep learning models in medical imaging. The code is\navailable in (will be available).\n","authors":["Subhashis Suara","Aayush Jha","Pratik Sinha","Arif Ahmed Sekh"],"pdf_url":"https://arxiv.org/pdf/2307.10506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10504v1","updated":"2023-07-20T00:02:24Z","published":"2023-07-20T00:02:24Z","title":"Identifying Interpretable Subspaces in Image Representations","summary":" We propose Automatic Feature Explanation using Contrasting Concepts (FALCON),\nan interpretability framework to explain features of image representations. For\na target feature, FALCON captions its highly activating cropped images using a\nlarge captioning dataset (like LAION-400m) and a pre-trained vision-language\nmodel like CLIP. Each word among the captions is scored and ranked leading to a\nsmall number of shared, human-understandable concepts that closely describe the\ntarget feature. FALCON also applies contrastive interpretation using lowly\nactivating (counterfactual) images, to eliminate spurious concepts. Although\nmany existing approaches interpret features independently, we observe in\nstate-of-the-art self-supervised and supervised models, that less than 20% of\nthe representation space can be explained by individual features. We show that\nfeatures in larger spaces become more interpretable when studied in groups and\ncan be explained with high-order scoring concepts through FALCON. We discuss\nhow extracted concepts can be used to explain and debug failures in downstream\ntasks. 
Finally, we present a technique to transfer concepts from one\n(explainable) representation space to another unseen representation space by\nlearning a simple linear transformation.\n","authors":["Neha Kalibhat","Shweta Bhardwaj","Bayan Bruss","Hamed Firooz","Maziar Sanjabi","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2307.10504v1.pdf","comment":"Published at ICML 2023"},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer: Gated - Long, Short Sequence Transformer for Step Recognition\n in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken in account. In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11261v1","updated":"2023-07-20T22:41:23Z","published":"2023-07-20T22:41:23Z","title":"SimCol3D -- 3D Reconstruction during Colonoscopy Challenge","summary":" Colorectal cancer is one of the most common cancers in the world. While\ncolonoscopy is an effective screening technique, navigating an endoscope\nthrough the colon to detect polyps is challenging. A 3D map of the observed\nsurfaces could enhance the identification of unscreened colon tissue and serve\nas a training platform. However, reconstructing the colon from video footage\nremains unsolved due to numerous factors such as self-occlusion, reflective\nsurfaces, lack of texture, and tissue deformation that limit feature-based\nmethods. Learning-based approaches hold promise as robust alternatives, but\nnecessitate extensive datasets. By establishing a benchmark, the 2022 EndoVis\nsub-challenge SimCol3D aimed to facilitate data-driven depth and pose\nprediction during colonoscopy. The challenge was hosted as part of MICCAI 2022\nin Singapore. Six teams from around the world and representatives from academia\nand industry participated in the three sub-challenges: synthetic depth\nprediction, synthetic pose prediction, and real pose prediction. This paper\ndescribes the challenge, the submitted methods, and their results. We show that\ndepth prediction in virtual colonoscopy is robustly solvable, while pose\nestimation remains an open research question.\n","authors":["Anita Rau","Sophia Bano","Yueming Jin","Pablo Azagra","Javier Morlana","Edward Sanderson","Bogdan J. 
Matuszewski","Jae Young Lee","Dong-Jae Lee","Erez Posner","Netanel Frank","Varshini Elangovan","Sista Raviteja","Zhengwen Li","Jiquan Liu","Seenivasan Lalithkumar","Mobarakol Islam","Hongliang Ren","José M. M. Montiel","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2307.11261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11259v1","updated":"2023-07-20T22:35:27Z","published":"2023-07-20T22:35:27Z","title":"Towards Non-Parametric Models for Confidence Aware Image Prediction from\n Low Data using Gaussian Processes","summary":" The ability to envision future states is crucial to informed decision making\nwhile interacting with dynamic environments. With cameras providing a prevalent\nand information rich sensing modality, the problem of predicting future states\nfrom image sequences has garnered a lot of attention. Current state of the art\nmethods typically train large parametric models for their predictions. Though\noften able to predict with accuracy, these models rely on the availability of\nlarge training datasets to converge to useful solutions. In this paper we focus\non the problem of predicting future images of an image sequence from very\nlittle training data. To approach this problem, we use non-parametric models to\ntake a probabilistic approach to image prediction. We generate probability\ndistributions over sequentially predicted images and propagate uncertainty\nthrough time to generate a confidence metric for our predictions. Gaussian\nProcesses are used for their data efficiency and ability to readily incorporate\nnew training data online. We showcase our method by successfully predicting\nfuture frames of a smooth fluid simulation environment.\n","authors":["Nikhil U. Shinde","Florian Richter","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2307.11259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11253v1","updated":"2023-07-20T22:09:04Z","published":"2023-07-20T22:09:04Z","title":"Joint one-sided synthetic unpaired image translation and segmentation\n for colorectal cancer prevention","summary":" Deep learning has shown excellent performance in analysing medical images.\nHowever, datasets are difficult to obtain due privacy issues, standardization\nproblems, and lack of annotations. We address these problems by producing\nrealistic synthetic images using a combination of 3D technologies and\ngenerative adversarial networks. We propose CUT-seg, a joint training where a\nsegmentation model and a generative model are jointly trained to produce\nrealistic images while learning to segment polyps. We take advantage of recent\none-sided translation models because they use significantly less memory,\nallowing us to add a segmentation model in the training loop. CUT-seg performs\nbetter, is computationally less expensive, and requires less real images than\nother memory-intensive image translation approaches that require two stage\ntraining. Promising results are achieved on five real polyp segmentation\ndatasets using only one real image and zero real annotations. As a part of this\nstudy we release Synth-Colon, an entirely synthetic dataset that includes 20000\nrealistic colon images and additional details about depth and 3D geometry:\nhttps://enric1994.github.io/synth-colon\n","authors":["Enric Moreu","Eric Arazo","Kevin McGuinness","Noel E. 
O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11253v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2202.08680"},{"id":"http://arxiv.org/abs/2307.11227v1","updated":"2023-07-20T20:45:13Z","published":"2023-07-20T20:45:13Z","title":"UP-DP: Unsupervised Prompt Learning for Data Pre-Selection with\n Vision-Language Models","summary":" In this study, we investigate the task of data pre-selection, which aims to\nselect instances for labeling from an unlabeled dataset through a single pass,\nthereby optimizing performance for undefined downstream tasks with a limited\nannotation budget. Previous approaches to data pre-selection relied solely on\nvisual features extracted from foundation models, such as CLIP and BLIP-2, but\nlargely ignored the powerfulness of text features. In this work, we argue that,\nwith proper design, the joint feature space of both vision and text can yield a\nbetter representation for data pre-selection. To this end, we introduce UP-DP,\na simple yet effective unsupervised prompt learning approach that adapts\nvision-language models, like BLIP-2, for data pre-selection. Specifically, with\nthe BLIP-2 parameters frozen, we train text prompts to extract the joint\nfeatures with improved representation, ensuring a diverse cluster structure\nthat covers the entire dataset. We extensively compare our method with the\nstate-of-the-art using seven benchmark datasets in different settings,\nachieving up to a performance gain of 20%. Interestingly, the prompts learned\nfrom one dataset demonstrate significant generalizability and can be applied\ndirectly to enhance the feature extraction of BLIP-2 from other datasets. To\nthe best of our knowledge, UP-DP is the first work to incorporate unsupervised\nprompt learning in a vision-language model for data pre-selection.\n","authors":["Xin Li","Sima Behpour","Thang Doan","Wenbin He","Liang Gou","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2307.11227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03297v2","updated":"2023-07-20T19:28:22Z","published":"2022-10-07T03:10:34Z","title":"Preprocessors Matter! Realistic Decision-Based Attacks on Machine\n Learning Systems","summary":" Decision-based attacks construct adversarial examples against a machine\nlearning (ML) model by making only hard-label queries. These attacks have\nmainly been applied directly to standalone neural networks. However, in\npractice, ML models are just one component of a larger learning system. We find\nthat by adding a single preprocessor in front of a classifier, state-of-the-art\nquery-based attacks are up to 7$\\times$ less effective at attacking a\nprediction pipeline than at attacking the model alone. We explain this\ndiscrepancy by the fact that most preprocessors introduce some notion of\ninvariance to the input space. Hence, attacks that are unaware of this\ninvariance inevitably waste a large number of queries to re-discover or\novercome it. We, therefore, develop techniques to (i) reverse-engineer the\npreprocessor and then (ii) use this extracted information to attack the\nend-to-end system. Our preprocessors extraction method requires only a few\nhundred queries, and our preprocessor-aware attacks recover the same efficacy\nas when attacking the model alone. The code can be found at\nhttps://github.com/google-research/preprocessor-aware-black-box-attack.\n","authors":["Chawin Sitawarin","Florian Tramèr","Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2210.03297v2.pdf","comment":"ICML 2023. 
Code can be found at\n https://github.com/google-research/preprocessor-aware-black-box-attack"},{"id":"http://arxiv.org/abs/2302.11827v2","updated":"2023-07-20T19:21:51Z","published":"2023-02-23T07:26:50Z","title":"Open Challenges for Monocular Single-shot 6D Object Pose Estimation","summary":" Object pose estimation is a non-trivial task that enables robotic\nmanipulation, bin picking, augmented reality, and scene understanding, to name\na few use cases. Monocular object pose estimation gained considerable momentum\nwith the rise of high-performing deep learning-based solutions and is\nparticularly interesting for the community since sensors are inexpensive and\ninference is fast. Prior works establish the comprehensive state of the art for\ndiverse pose estimation problems. Their broad scopes make it difficult to\nidentify promising future directions. We narrow down the scope to the problem\nof single-shot monocular 6D object pose estimation, which is commonly used in\nrobotics, and thus are able to identify such trends. By reviewing recent\npublications in robotics and computer vision, the state of the art is\nestablished at the union of both fields. Following that, we identify promising\nresearch directions in order to help researchers to formulate relevant research\nideas and effectively advance the state of the art. Findings include that\nmethods are sophisticated enough to overcome the domain shift and that\nocclusion handling is a fundamental challenge. We also highlight problems such\nas novel object pose estimation and challenging materials handling as central\nchallenges to advance robotics.\n","authors":["Stefan Thalhammer","Peter Hönig","Jean-Baptiste Weibel","Markus Vincze"],"pdf_url":"https://arxiv.org/pdf/2302.11827v2.pdf","comment":"Revised version in the making"},{"id":"http://arxiv.org/abs/2307.11197v1","updated":"2023-07-20T19:20:35Z","published":"2023-07-20T19:20:35Z","title":"Heuristic Hyperparameter Choice for Image Anomaly Detection","summary":" Anomaly detection (AD) in images is a fundamental computer vision problem by\ndeep learning neural network to identify images deviating significantly from\nnormality. The deep features extracted from pretrained models have been proved\nto be essential for AD based on multivariate Gaussian distribution analysis.\nHowever, since models are usually pretrained on a large dataset for\nclassification tasks such as ImageNet, they might produce lots of redundant\nfeatures for AD, which increases computational cost and degrades the\nperformance. We aim to do the dimension reduction of Negated Principal\nComponent Analysis (NPCA) for these features. So we proposed some heuristic to\nchoose hyperparameter of NPCA algorithm for getting as fewer components of\nfeatures as possible while ensuring a good performance.\n","authors":["Zeyu Jiang","João P. C. Bertoldo","Etienne Decencière"],"pdf_url":"https://arxiv.org/pdf/2307.11197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03512v2","updated":"2023-07-20T18:27:42Z","published":"2023-07-07T11:00:44Z","title":"Tranfer Learning of Semantic Segmentation Methods for Identifying Buried\n Archaeological Structures on LiDAR Data","summary":" When applying deep learning to remote sensing data in archaeological\nresearch, a notable obstacle is the limited availability of suitable datasets\nfor training models. The application of transfer learning is frequently\nemployed to mitigate this drawback. 
However, there is still a need to explore\nits effectiveness when applied across different archaeological datasets. This\npaper compares the performance of various transfer learning configurations\nusing two semantic segmentation deep neural networks on two LiDAR datasets. The\nexperimental results indicate that transfer learning-based approaches in\narchaeology can lead to performance improvements, although a systematic\nenhancement has not yet been observed. We provide specific insights about the\nvalidity of such techniques that can serve as a baseline for future works.\n","authors":["Paolo Soleni","Wouter B. Verschoof-van der Vaart","Žiga Kokalj","Arianna Traviglia","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.03512v2.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2023 (IGARSS 2023) @IEEE copyright"},{"id":"http://arxiv.org/abs/2307.11141v1","updated":"2023-07-20T17:53:04Z","published":"2023-07-20T17:53:04Z","title":"Towards General Game Representations: Decomposing Games Pixels into\n Content and Style","summary":" On-screen game footage contains rich contextual information that players\nprocess when playing and experiencing a game. Learning pixel representations of\ngames can benefit artificial intelligence across several downstream tasks\nincluding game-playing agents, procedural content generation, and player\nmodelling. The generalizability of these methods, however, remains a challenge,\nas learned representations should ideally be shared across games with similar\ngame mechanics. This could allow, for instance, game-playing agents trained on\none game to perform well in similar games with no re-training. This paper\nexplores how generalizable pre-trained computer vision encoders can be for such\ntasks, by decomposing the latent space into content embeddings and style\nembeddings. The goal is to minimize the domain gap between games of the same\ngenre when it comes to game content critical for downstream tasks, and ignore\ndifferences in graphical style. We employ a pre-trained Vision Transformer\nencoder and a decomposition technique based on game genres to obtain separate\ncontent and style embeddings. Our findings show that the decomposed embeddings\nachieve style invariance across multiple games while still maintaining strong\ncontent extraction capabilities. We argue that the proposed decomposition of\ncontent and style offers better generalization capacities across game\nenvironments independently of the downstream task.\n","authors":["Chintan Trivedi","Konstantinos Makantasis","Antonios Liapis","Georgios N. Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2307.11141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.10495v2","updated":"2023-07-20T16:33:52Z","published":"2022-10-19T12:04:47Z","title":"ADPS: Asymmetric Distillation Post-Segmentation Method for Image Anomaly\n Detection","summary":" Knowledge Distillation-based Anomaly Detection (KDAD) methods rely on the\nteacher-student paradigm to detect and segment anomalous regions by contrasting\nthe unique features extracted by both networks. However, existing KDAD methods\nsuffer from two main limitations: 1) the student network can effortlessly\nreplicate the teacher network's representations, and 2) the features of the\nteacher network serve solely as a ``reference standard\" and are not fully\nleveraged. Toward this end, we depart from the established paradigm and instead\npropose an innovative approach called Asymmetric Distillation Post-Segmentation\n(ADPS). 
Our ADPS employs an asymmetric distillation paradigm that takes\ndistinct forms of the same image as the input of the teacher-student networks,\ndriving the student network to learn discriminating representations for\nanomalous regions.\n Meanwhile, a customized Weight Mask Block (WMB) is proposed to generate a\ncoarse anomaly localization mask that transfers the distilled knowledge\nacquired from the asymmetric paradigm to the teacher network. Equipped with\nWMB, the proposed Post-Segmentation Module (PSM) is able to effectively detect\nand segment abnormal regions with fine structures and clear boundaries.\nExperimental results demonstrate that the proposed ADPS outperforms the\nstate-of-the-art methods in detecting and segmenting anomalies. Surprisingly,\nADPS significantly improves Average Precision (AP) metric by 9% and 20% on the\nMVTec AD and KolektorSDD2 datasets, respectively.\n","authors":["Peng Xing","Hao Tang","Jinhui Tang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2210.10495v2.pdf","comment":"11pages,9 figures"},{"id":"http://arxiv.org/abs/2307.11130v1","updated":"2023-07-20T16:07:02Z","published":"2023-07-20T16:07:02Z","title":"Frequency-aware optical coherence tomography image super-resolution via\n conditional generative adversarial neural network","summary":" Optical coherence tomography (OCT) has stimulated a wide range of medical\nimage-based diagnosis and treatment in fields such as cardiology and\nophthalmology. Such applications can be further facilitated by deep\nlearning-based super-resolution technology, which improves the capability of\nresolving morphological structures. However, existing deep learning-based\nmethod only focuses on spatial distribution and disregard frequency fidelity in\nimage reconstruction, leading to a frequency bias. To overcome this limitation,\nwe propose a frequency-aware super-resolution framework that integrates three\ncritical frequency-based modules (i.e., frequency transformation, frequency\nskip connection, and frequency alignment) and frequency-based loss function\ninto a conditional generative adversarial network (cGAN). We conducted a\nlarge-scale quantitative study from an existing coronary OCT dataset to\ndemonstrate the superiority of our proposed framework over existing deep\nlearning frameworks. In addition, we confirmed the generalizability of our\nframework by applying it to fish corneal images and rat retinal images,\ndemonstrating its capability to super-resolve morphological details in eye\nimaging.\n","authors":["Xueshen Li","Zhenxing Dong","Hongshan Liu","Jennifer J. Kang-Mieler","Yuye Ling","Yu Gan"],"pdf_url":"https://arxiv.org/pdf/2307.11130v1.pdf","comment":"13 pages, 7 figures, submitted to Biomedical Optics Express special\n issue"},{"id":"http://arxiv.org/abs/2307.11118v1","updated":"2023-07-20T14:37:30Z","published":"2023-07-20T14:37:30Z","title":"Diffusion Sampling with Momentum for Mitigating Divergence Artifacts","summary":" Despite the remarkable success of diffusion models in image generation, slow\nsampling remains a persistent issue. To accelerate the sampling process, prior\nstudies have reformulated diffusion sampling as an ODE/SDE and introduced\nhigher-order numerical methods. However, these methods often produce divergence\nartifacts, especially with a low number of sampling steps, which limits the\nachievable acceleration. In this paper, we investigate the potential causes of\nthese artifacts and suggest that the small stability regions of these methods\ncould be the principal cause. 
To address this issue, we propose two novel\ntechniques. The first technique involves the incorporation of Heavy Ball (HB)\nmomentum, a well-known technique for improving optimization, into existing\ndiffusion numerical methods to expand their stability regions. We also prove\nthat the resulting methods have first-order convergence. The second technique,\ncalled Generalized Heavy Ball (GHVB), constructs a new high-order method that\noffers a variable trade-off between accuracy and artifact suppression.\nExperimental results show that our techniques are highly effective in reducing\nartifacts and improving image quality, surpassing state-of-the-art diffusion\nsolvers on both pixel-based and latent-based diffusion models for low-step\nsampling. Our research provides novel insights into the design of numerical\nmethods for future diffusion work.\n","authors":["Suttisak Wizadwongsa","Worameth Chinchuthakun","Pramook Khungurn","Amit Raj","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2307.11118v1.pdf","comment":"Project page: https://github.com/sWizad/momentum-diffusion"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.11019v1","updated":"2023-07-20T16:46:10Z","published":"2023-07-20T16:46:10Z","title":"Investigating the Factual Knowledge Boundary of Large Language Models\n with Retrieval Augmentation","summary":" Knowledge-intensive tasks (e.g., open-domain question answering (QA)) require\na substantial amount of factual knowledge and often rely on external\ninformation for assistance. Recently, large language models (LLMs) (e.g.,\nChatGPT), have demonstrated impressive prowess in solving a wide range of tasks\nwith world knowledge, including knowledge-intensive tasks. However, it remains\nunclear how well LLMs are able to perceive their factual knowledge boundaries,\nparticularly how they behave when incorporating retrieval augmentation. In this\nstudy, we present an initial analysis of the factual knowledge boundaries of\nLLMs and how retrieval augmentation affects LLMs on open-domain QA. Specially,\nwe focus on three primary research questions and analyze them by examining QA\nperformance, priori judgement and posteriori judgement of LLMs. We show\nevidence that LLMs possess unwavering confidence in their capabilities to\nrespond to questions and the accuracy of their responses. Furthermore,\nretrieval augmentation proves to be an effective approach in enhancing LLMs'\nawareness of knowledge boundaries, thereby improving their judgemental\nabilities. Additionally, we also find that LLMs have a propensity to rely on\nthe provided retrieval results when formulating answers, while the quality of\nthese results significantly impacts their reliance. The code to reproduce this\nwork is available at https://github.com/RUCAIBox/LLM-Knowledge-Boundary.\n","authors":["Ruiyang Ren","Yuhao Wang","Yingqi Qu","Wayne Xin Zhao","Jing Liu","Hao Tian","Hua Wu","Ji-Rong Wen","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.11876v3","updated":"2023-07-20T10:42:36Z","published":"2021-05-25T12:23:24Z","title":"Criterion-based Heterogeneous Collaborative Filtering for Multi-behavior\n Implicit Recommendation","summary":" Recent years have witnessed the explosive growth of interaction behaviors in\nmultimedia information systems, where multi-behavior recommender systems have\nreceived increasing attention by leveraging data from various auxiliary\nbehaviors such as tip and collect. 
Among various multi-behavior recommendation\nmethods, non-sampling methods have shown superiority over negative sampling\nmethods. However, two observations are usually ignored in existing\nstate-of-the-art non-sampling methods based on binary regression: (1) users\nhave different preference strengths for different items, so they cannot be\nmeasured simply by binary implicit data; (2) the dependency across multiple\nbehaviors varies for different users and items. To tackle the above issue, we\npropose a novel non-sampling learning framework named Criterion-guided\nHeterogeneous Collaborative Filtering (CHCF). CHCF introduces both upper and\nlower thresholds to indicate selection criteria, which will guide user\npreference learning. Besides, CHCF integrates criterion learning and user\npreference learning into a unified framework, which can be trained jointly for\nthe interaction prediction of the target behavior. We further theoretically\ndemonstrate that the optimization of Collaborative Metric Learning can be\napproximately achieved by the CHCF learning framework in a non-sampling form\neffectively. Extensive experiments on three real-world datasets show the\neffectiveness of CHCF in heterogeneous scenarios.\n","authors":["Xiao Luo","Daqing Wu","Yiyang Gu","Chong Chen","Luchen Liu","Jinwen Ma","Ming Zhang","Minghua Deng","Jianqiang Huang","Xian-Sheng Hua"],"pdf_url":"https://arxiv.org/pdf/2105.11876v3.pdf","comment":"Accepted by ACM Transactions on Knowledge Discovery from Data (TKDD)"},{"id":"http://arxiv.org/abs/2307.10747v1","updated":"2023-07-20T10:19:47Z","published":"2023-07-20T10:19:47Z","title":"Enhancing Job Recommendation through LLM-based Generative Adversarial\n Networks","summary":" Recommending suitable jobs to users is a critical task in online recruitment\nplatforms, as it can enhance users' satisfaction and the platforms'\nprofitability. While existing job recommendation methods encounter challenges\nsuch as the low quality of users' resumes, which hampers their accuracy and\npractical effectiveness. With the rapid development of large language models\n(LLMs), utilizing the rich external knowledge encapsulated within them, as well\nas their powerful capabilities of text processing and reasoning, is a promising\nway to complete users' resumes for more accurate recommendations. However,\ndirectly leveraging LLMs to enhance recommendation results is not a\none-size-fits-all solution, as LLMs may suffer from fabricated generation and\nfew-shot problems, which degrade the quality of resume completion. In this\npaper, we propose a novel LLM-based approach for job recommendation. To\nalleviate the limitation of fabricated generation for LLMs, we extract accurate\nand valuable information beyond users' self-description, which helps the LLMs\nbetter profile users for resume completion. Specifically, we not only extract\nusers' explicit properties (e.g., skills, interests) from their\nself-description but also infer users' implicit characteristics from their\nbehaviors for more accurate and meaningful resume completion. Nevertheless,\nsome users still suffer from few-shot problems, which arise due to scarce\ninteraction records, leading to limited guidance for the models in generating\nhigh-quality resumes. To address this issue, we propose aligning unpaired\nlow-quality with high-quality generated resumes by Generative Adversarial\nNetworks (GANs), which can refine the resume representations for better\nrecommendation results. 
Extensive experiments on three large real-world\nrecruitment datasets demonstrate the effectiveness of our proposed method.\n","authors":["Yingpeng Du","Di Luo","Rui Yan","Hongzhi Liu","Yang Song","Hengshu Zhu","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10747v1.pdf","comment":"13 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2011.00696v2","updated":"2023-07-20T08:56:26Z","published":"2020-11-02T03:07:38Z","title":"ABNIRML: Analyzing the Behavior of Neural IR Models","summary":" Pretrained contextualized language models such as BERT and T5 have\nestablished a new state-of-the-art for ad-hoc search. However, it is not yet\nwell-understood why these methods are so effective, what makes some variants\nmore effective than others, and what pitfalls they may have. We present a new\ncomprehensive framework for Analyzing the Behavior of Neural IR ModeLs\n(ABNIRML), which includes new types of diagnostic probes that allow us to test\nseveral characteristics -- such as writing styles, factuality, sensitivity to\nparaphrasing and word order -- that are not addressed by previous techniques.\nTo demonstrate the value of the framework, we conduct an extensive empirical\nstudy that yields insights into the factors that contribute to the neural\nmodel's gains, and identify potential unintended biases the models exhibit.\nSome of our results confirm conventional wisdom, like that recent neural\nranking models rely less on exact term overlap with the query, and instead\nleverage richer linguistic information, evidenced by their higher sensitivity\nto word and sentence order. Other results are more surprising, such as that\nsome models (e.g., T5 and ColBERT) are biased towards factually correct (rather\nthan simply relevant) texts. Further, some characteristics vary even for the\nsame base language model, and other characteristics can appear due to random\nvariations during model training.\n","authors":["Sean MacAvaney","Sergey Feldman","Nazli Goharian","Doug Downey","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2011.00696v2.pdf","comment":"TACL version"},{"id":"http://arxiv.org/abs/2307.10702v1","updated":"2023-07-20T08:47:54Z","published":"2023-07-20T08:47:54Z","title":"A Constraint-based Recommender System via RDF Knowledge Graphs","summary":" Knowledge graphs, represented in RDF, are able to model entities and their\nrelations by means of ontologies. The use of knowledge graphs for information\nmodeling has attracted interest in recent years. In recommender systems, items\nand users can be mapped and integrated into the knowledge graph, which can\nrepresent more links and relationships between users and items.\nConstraint-based recommender systems are based on the idea of explicitly\nexploiting deep recommendation knowledge through constraints to identify\nrelevant recommendations. When combined with knowledge graphs, a\nconstraint-based recommender system gains several benefits in terms of\nconstraint sets. In this paper, we investigate and propose the construction of\na constraint-based recommender system via RDF knowledge graphs applied to the\nvehicle purchase/sale domain. 
The results of our experiments show that the\nproposed approach is able to efficiently identify recommendations in accordance\nwith user preferences.\n","authors":["Ngoc Luyen Le","Marie-Hélène Abel","Philippe Gouspillou"],"pdf_url":"https://arxiv.org/pdf/2307.10702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10680v1","updated":"2023-07-20T08:14:06Z","published":"2023-07-20T08:14:06Z","title":"A Personalized Recommender System Based-on Knowledge Graph Embeddings","summary":" Knowledge graphs have proven to be effective for modeling entities and their\nrelationships through the use of ontologies. The recent emergence in interest\nfor using knowledge graphs as a form of information modeling has led to their\nincreased adoption in recommender systems. By incorporating users and items\ninto the knowledge graph, these systems can better capture the implicit\nconnections between them and provide more accurate recommendations. In this\npaper, we investigate and propose the construction of a personalized\nrecommender system via knowledge graphs embedding applied to the vehicle\npurchase/sale domain. The results of our experimentation demonstrate the\nefficacy of the proposed method in providing relevant recommendations that are\nconsistent with individual users.\n","authors":["Ngoc Luyen Le","Marie-Hélène Abel","Philippe Gouspillou"],"pdf_url":"https://arxiv.org/pdf/2307.10680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10650v1","updated":"2023-07-20T07:30:27Z","published":"2023-07-20T07:30:27Z","title":"Language-Enhanced Session-Based Recommendation with Decoupled\n Contrastive Learning","summary":" Session-based recommendation techniques aim to capture dynamic user behavior\nby analyzing past interactions. However, existing methods heavily rely on\nhistorical item ID sequences to extract user preferences, leading to challenges\nsuch as popular bias and cold-start problems. In this paper, we propose a\nhybrid multimodal approach for session-based recommendation to address these\nchallenges. Our approach combines different modalities, including textual\ncontent and item IDs, leveraging the complementary nature of these modalities\nusing CatBoost. To learn universal item representations, we design a language\nrepresentation-based item retrieval architecture that extracts features from\nthe textual content utilizing pre-trained language models. Furthermore, we\nintroduce a novel Decoupled Contrastive Learning method to enhance the\neffectiveness of the language representation. This technique decouples the\nsequence representation and item representation space, facilitating\nbidirectional alignment through dual-queue contrastive learning.\nSimultaneously, the momentum queue provides a large number of negative samples,\neffectively enhancing the effectiveness of contrastive learning. Our approach\nyielded competitive results, securing a 5th place ranking in KDD CUP 2023 Task\n1. 
We have released the source code and pre-trained models associated with this\nwork.\n","authors":["Zhipeng Zhang","Piao Tong","Yingwei Ma","Qiao Liu","Xujiang Liu","Xu Luo"],"pdf_url":"https://arxiv.org/pdf/2307.10650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10639v1","updated":"2023-07-20T07:08:25Z","published":"2023-07-20T07:08:25Z","title":"Improving Semantic Similarity Measure Within a Recommender System\n Based-on RDF Graphs","summary":" In today's era of information explosion, more users are becoming more reliant\nupon recommender systems to have better advice, suggestions, or inspire them.\nThe measure of the semantic relatedness or likeness between terms, words, or\ntext data plays an important role in different applications dealing with\ntextual data, as in a recommender system. Over the past few years, many\nontologies have been developed and used as a form of structured representation\nof knowledge bases for information systems. The measure of semantic similarity\nfrom ontology has developed by several methods. In this paper, we propose and\ncarry on an approach for the improvement of semantic similarity calculations\nwithin a recommender system based-on RDF graphs.\n","authors":["Ngoc Luyen Le","Marie-Hélène Abel","Philippe Gouspillou"],"pdf_url":"https://arxiv.org/pdf/2307.10639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10617v1","updated":"2023-07-20T06:35:43Z","published":"2023-07-20T06:35:43Z","title":"Detecting deceptive reviews using text classification","summary":" In recent years, online reviews play a vital role for promoting any kind of\nproduct or services. Businesses may embed fake reviews in order to attract\ncustomers to purchase their products. They may even highlight the benefits of\ntheir own product or criticize the competition's product. Marketers,\nadvertisers, and other online business users have incentive to create fake\npositive reviews for products which they want to promote or give fake negative\nreviews for products which they really don't like. So now-a-days writing a\ndeceptive review is inevitable thing for promoting their own business or\ndegrading competitor's reputation. Thus, identifying deceptive reviews is an\nintense and on-going research area. This research paper proposes machine\nlearning model approach to identify deceptive reviews. The paper investigates\nthe performance of the several experiments done on a Deceptive Opinion Spam\nCorpus dataset of restaurants reviews. We developed a n-gram model and max\nfeatures to identify deceptive contents with a particular focus on fake\nreviews. Further, we conduct a benchmark study to investigate the performance\nof two different features extraction techniques and apply five machine learning\nclassification techniques. The experimental results show that passive\naggressive classifier outperforms other algorithms, and it reaches the highest\naccuracy not only in text classification but also to fake reviews. 
We also\nstudy the data augmentation and implement different deep learning techniques.\n","authors":["Anusuya Baby"],"pdf_url":"https://arxiv.org/pdf/2307.10617v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2306.11296v2","updated":"2023-07-20T02:20:35Z","published":"2023-06-20T05:20:29Z","title":"ChatGPT Chemistry Assistant for Text Mining and Prediction of MOF\n Synthesis","summary":" We use prompt engineering to guide ChatGPT in the automation of text mining\nof metal-organic frameworks (MOFs) synthesis conditions from diverse formats\nand styles of the scientific literature. This effectively mitigates ChatGPT's\ntendency to hallucinate information -- an issue that previously made the use of\nLarge Language Models (LLMs) in scientific fields challenging. Our approach\ninvolves the development of a workflow implementing three different processes\nfor text mining, programmed by ChatGPT itself. All of them enable parsing,\nsearching, filtering, classification, summarization, and data unification with\ndifferent tradeoffs between labor, speed, and accuracy. We deploy this system\nto extract 26,257 distinct synthesis parameters pertaining to approximately 800\nMOFs sourced from peer-reviewed research articles. This process incorporates\nour ChemPrompt Engineering strategy to instruct ChatGPT in text mining,\nresulting in impressive precision, recall, and F1 scores of 90-99%.\nFurthermore, with the dataset built by text mining, we constructed a\nmachine-learning model with over 86% accuracy in predicting MOF experimental\ncrystallization outcomes and preliminarily identifying important factors in MOF\ncrystallization. We also developed a reliable data-grounded MOF chatbot to\nanswer questions on chemical reactions and synthesis procedures. Given that the\nprocess of using ChatGPT reliably mines and tabulates diverse MOF synthesis\ninformation in a unified format, while using only narrative language requiring\nno coding expertise, we anticipate that our ChatGPT Chemistry Assistant will be\nvery useful across various other chemistry sub-disciplines.\n","authors":["Zhiling Zheng","Oufan Zhang","Christian Borgs","Jennifer T. Chayes","Omar M. Yaghi"],"pdf_url":"https://arxiv.org/pdf/2306.11296v2.pdf","comment":"Published on Journal of the American Chemical Society (2023); 102\n pages (18-page manuscript, 84 pages of supporting information)"},{"id":"http://arxiv.org/abs/2307.11224v1","updated":"2023-07-20T20:37:24Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. While\nthese models are not exclusively designed for text generation, they excel in\napplications such as dense retrieval and semantic textual similarity. This\npaper details the development of Jina Embeddings, starting with the creation of\na high-quality pairwise and triplet dataset. 
It underlines the crucial role of\ndata cleaning in dataset preparation, gives in-depth insights into the model\ntraining process, and concludes with a comprehensive performance evaluation\nusing the Massive Textual Embedding Benchmark (MTEB).\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v1.pdf","comment":"9 pages, 2 page appendix, EMNLP 2023 Industrial Track"},{"id":"http://arxiv.org/abs/2307.11140v1","updated":"2023-07-20T17:52:47Z","published":"2023-07-20T17:52:47Z","title":"RCVaR: an Economic Approach to Estimate Cyberattacks Costs using Data\n from Industry Reports","summary":" Digitization increases business opportunities and the risk of companies being\nvictims of devastating cyberattacks. Therefore, managing risk exposure and\ncybersecurity strategies is essential for digitized companies that want to\nsurvive in competitive markets. However, understanding company-specific risks\nand quantifying their associated costs is not trivial. Current approaches fail\nto provide individualized and quantitative monetary estimations of\ncybersecurity impacts. Due to limited resources and technical expertise, SMEs\nand even large companies are affected and struggle to quantify their\ncyberattack exposure. Therefore, novel approaches must be placed to support the\nunderstanding of the financial loss due to cyberattacks. This article\nintroduces the Real Cyber Value at Risk (RCVaR), an economical approach for\nestimating cybersecurity costs using real-world information from public\ncybersecurity reports. RCVaR identifies the most significant cyber risk factors\nfrom various sources and combines their quantitative results to estimate\nspecific cyberattacks costs for companies. Furthermore, RCVaR extends current\nmethods to achieve cost and risk estimations based on historical real-world\ndata instead of only probability-based simulations. The evaluation of the\napproach on unseen data shows the accuracy and efficiency of the RCVaR in\npredicting and managing cyber risks. Thus, it shows that the RCVaR is a\nvaluable addition to cybersecurity planning and risk management processes.\n","authors":["Muriel Figueredo Franco","Fabian Künzler","Jan von der Assen","Chao Feng","Burkhard Stiller"],"pdf_url":"https://arxiv.org/pdf/2307.11140v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.11091v1","updated":"2023-07-20T17:59:59Z","published":"2023-07-20T17:59:59Z","title":"Data-driven criteria for quantum correlations","summary":" We build a machine learning model to detect correlations in a three-qubit\nsystem using a neural network trained in an unsupervised manner on randomly\ngenerated states. The network is forced to recognize separable states, and\ncorrelated states are detected as anomalies. Quite surprisingly, we find that\nthe proposed detector performs much better at distinguishing a weaker form of\nquantum correlations, namely, the quantum discord, than entanglement. In fact,\nit has a tendency to grossly overestimate the set of entangled states even at\nthe optimal threshold for entanglement detection, while it underestimates the\nset of discordant states to a much lesser extent. In order to illustrate the\nnature of states classified as quantum-correlated, we construct a diagram\ncontaining various types of states -- entangled, as well as separable, both\ndiscordant and non-discordant. 
We find that the near-zero value of the\nrecognition loss reproduces the shape of the non-discordant separable states\nwith high accuracy, especially considering the non-trivial shape of this set on\nthe diagram. The network architecture is designed carefully: it preserves\nseparability, and its output is equivariant with respect to qubit permutations.\nWe show that the choice of architecture is important to get the highest\ndetection accuracy, much better than for a baseline model that just utilizes a\npartial trace operation.\n","authors":["Mateusz Krawczyk","Jarosław Pawłowski","Maciej M. Maśka","Katarzyna Roszak"],"pdf_url":"https://arxiv.org/pdf/2307.11091v1.pdf","comment":"7 pages, 3 figures, 3 tables, and extra 5 pages of supplementary\n materials"},{"id":"http://arxiv.org/abs/2307.11086v1","updated":"2023-07-20T17:59:33Z","published":"2023-07-20T17:59:33Z","title":"PAPR: Proximity Attention Point Rendering","summary":" Learning accurate and parsimonious point cloud representations of scene\nsurfaces from scratch remains a challenge in 3D representation learning.\nExisting point-based methods often suffer from the vanishing gradient problem\nor require a large number of points to accurately model scene geometry and\ntexture. To address these limitations, we propose Proximity Attention Point\nRendering (PAPR), a novel method that consists of a point-based scene\nrepresentation and a differentiable renderer. Our scene representation uses a\npoint cloud where each point is characterized by its spatial position,\nforeground score, and view-independent feature vector. The renderer selects the\nrelevant points for each ray and produces accurate colours using their\nassociated features. PAPR effectively learns point cloud positions to represent\nthe correct scene geometry, even when the initialization drastically differs\nfrom the target geometry. Notably, our method captures fine texture details\nwhile using only a parsimonious set of points. We also demonstrate four\npractical applications of our method: geometry editing, object manipulation,\ntexture transfer, and exposure control. More results and code are available on\nour project website at https://zvict.github.io/papr/.\n","authors":["Yanshu Zhang","Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2307.11086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07269v2","updated":"2023-07-20T17:59:25Z","published":"2023-07-14T10:50:43Z","title":"Frequency Domain Adversarial Training for Robust Volumetric Medical\n Segmentation","summary":" It is imperative to ensure the robustness of deep learning models in critical\napplications such as, healthcare. While recent advances in deep learning have\nimproved the performance of volumetric medical image segmentation models, these\nmodels cannot be deployed for real-world applications immediately due to their\nvulnerability to adversarial attacks. We present a 3D frequency domain\nadversarial attack for volumetric medical image segmentation models and\ndemonstrate its advantages over conventional input or voxel domain attacks.\nUsing our proposed attack, we introduce a novel frequency domain adversarial\ntraining approach for optimizing a robust model against voxel and frequency\ndomain attacks. Moreover, we propose frequency consistency loss to regulate our\nfrequency domain adversarial training that achieves a better tradeoff between\nmodel's performance on clean and adversarial samples. 
Code is publicly\navailable at https://github.com/asif-hanif/vafa.\n","authors":["Asif Hanif","Muzammal Naseer","Salman Khan","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2307.07269v2.pdf","comment":"This paper has been accepted in MICCAI 2023 conference"},{"id":"http://arxiv.org/abs/2301.13867v2","updated":"2023-07-20T17:59:14Z","published":"2023-01-31T18:59:03Z","title":"Mathematical Capabilities of ChatGPT","summary":" We investigate the mathematical capabilities of two iterations of ChatGPT\n(released 9-January-2023 and 30-January-2023) and of GPT-4 by testing them on\npublicly available datasets, as well as hand-crafted ones, using a novel\nmethodology. In contrast to formal mathematics, where large databases of formal\nproofs are available (e.g., the Lean Mathematical Library), current datasets of\nnatural-language mathematics, used to benchmark language models, either cover\nonly elementary mathematics or are very small. We address this by publicly\nreleasing two new datasets: GHOSTS and miniGHOSTS. These are the first\nnatural-language datasets curated by working researchers in mathematics that\n(1) aim to cover graduate-level mathematics, (2) provide a holistic overview of\nthe mathematical capabilities of language models, and (3) distinguish multiple\ndimensions of mathematical reasoning. These datasets also test whether ChatGPT\nand GPT-4 can be helpful assistants to professional mathematicians by emulating\nuse cases that arise in the daily professional activities of mathematicians. We\nbenchmark the models on a range of fine-grained performance metrics. For\nadvanced mathematics, this is the most detailed evaluation effort to date. We\nfind that ChatGPT can be used most successfully as a mathematical assistant for\nquerying facts, acting as a mathematical search engine and knowledge base\ninterface. GPT-4 can additionally be used for undergraduate-level mathematics\nbut fails on graduate-level difficulty. Contrary to many positive reports in\nthe media about GPT-4 and ChatGPT's exam-solving abilities (a potential case of\nselection bias), their overall mathematical performance is well below the level\nof a graduate student. Hence, if your goal is to use ChatGPT to pass a\ngraduate-level math exam, you would be better off copying from your average\npeer!\n","authors":["Simon Frieder","Luca Pinchetti","Alexis Chevalier","Ryan-Rhys Griffiths","Tommaso Salvatori","Thomas Lukasiewicz","Philipp Christian Petersen","Julius Berner"],"pdf_url":"https://arxiv.org/pdf/2301.13867v2.pdf","comment":"Added further evaluations on another ChatGPT version and on GPT-4.\n The GHOSTS and miniGHOSTS datasets are available at\n https://github.com/xyfrieder/science-GHOSTS"},{"id":"http://arxiv.org/abs/2307.11085v1","updated":"2023-07-20T17:59:11Z","published":"2023-07-20T17:59:11Z","title":"Representation Learning in Anomaly Detection: Successes, Limits and a\n Grand Challenge","summary":" In this perspective paper, we argue that the dominant paradigm in anomaly\ndetection cannot scale indefinitely and will eventually hit fundamental limits.\nThis is due to a no free lunch principle for anomaly detection. These\nlimitations can be overcome when there are strong task priors, as is the case\nfor many industrial tasks. When such priors do not exist, the task is much\nharder for anomaly detection. 
We pose two such tasks as grand challenges for\nanomaly detection: i) scientific discovery by anomaly detection ii) a\n\"mini-grand\" challenge of detecting the most anomalous image in the ImageNet\ndataset. We believe new anomaly detection tools and ideas would need to be\ndeveloped to overcome these challenges.\n","authors":["Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2307.11085v1.pdf","comment":"Keynote talk at the Visual Anomaly and Novelty Detection Workshop,\n CVPR'23"},{"id":"http://arxiv.org/abs/2205.09208v2","updated":"2023-07-20T17:57:36Z","published":"2022-05-18T20:34:25Z","title":"Torchhd: An Open Source Python Library to Support Research on\n Hyperdimensional Computing and Vector Symbolic Architectures","summary":" Hyperdimensional computing (HD), also known as vector symbolic architectures\n(VSA), is a framework for computing with distributed representations by\nexploiting properties of random high-dimensional vector spaces. The commitment\nof the scientific community to aggregate and disseminate research in this\nparticularly multidisciplinary area has been fundamental for its advancement.\nJoining these efforts, we present Torchhd, a high-performance open source\nPython library for HD/VSA. Torchhd seeks to make HD/VSA more accessible and\nserves as an efficient foundation for further research and application\ndevelopment. The easy-to-use library builds on top of PyTorch and features\nstate-of-the-art HD/VSA functionality, clear documentation, and implementation\nexamples from well-known publications. Comparing publicly available code with\ntheir corresponding Torchhd implementation shows that experiments can run up to\n100x faster. Torchhd is available at:\nhttps://github.com/hyperdimensional-computing/torchhd.\n","authors":["Mike Heddes","Igor Nunes","Pere Vergés","Denis Kleyko","Danny Abraham","Tony Givargis","Alexandru Nicolau","Alexander Veidenbaum"],"pdf_url":"https://arxiv.org/pdf/2205.09208v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer : Gated - Long, Short Sequence Transformer for Step\n Recognition in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken in account. In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. 
Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11078v1","updated":"2023-07-20T17:55:17Z","published":"2023-07-20T17:55:17Z","title":"Brain2Music: Reconstructing Music from Human Brain Activity","summary":" The process of reconstructing experiences from human brain activity offers a\nunique lens into how the brain interprets and represents the world. In this\npaper, we introduce a method for reconstructing music from brain activity,\ncaptured using functional magnetic resonance imaging (fMRI). Our approach uses\neither music retrieval or the MusicLM music generation model conditioned on\nembeddings derived from fMRI data. The generated music resembles the musical\nstimuli that human subjects experienced, with respect to semantic properties\nlike genre, instrumentation, and mood. We investigate the relationship between\ndifferent components of MusicLM and brain activity through a voxel-wise\nencoding modeling analysis. Furthermore, we discuss which brain regions\nrepresent information derived from purely textual descriptions of music\nstimuli. We provide supplementary material including examples of the\nreconstructed music at https://google-research.github.io/seanet/brain2music\n","authors":["Timo I. Denk","Yu Takagi","Takuya Matsuyama","Andrea Agostinelli","Tomoya Nakai","Christian Frank","Shinji Nishimoto"],"pdf_url":"https://arxiv.org/pdf/2307.11078v1.pdf","comment":"Preprint; 21 pages; supplementary material:\n https://google-research.github.io/seanet/brain2music"},{"id":"http://arxiv.org/abs/2307.11077v1","updated":"2023-07-20T17:55:14Z","published":"2023-07-20T17:55:14Z","title":"AlignDet: Aligning Pre-training and Fine-tuning in Object Detection","summary":" The paradigm of large-scale pre-training followed by downstream fine-tuning\nhas been widely employed in various object detection algorithms. In this paper,\nwe reveal discrepancies in data, model, and task between the pre-training and\nfine-tuning procedure in existing practices, which implicitly limit the\ndetector's performance, generalization ability, and convergence speed. To this\nend, we propose AlignDet, a unified pre-training framework that can be adapted\nto various existing detectors to alleviate the discrepancies. AlignDet\ndecouples the pre-training process into two stages, i.e., image-domain and\nbox-domain pre-training. The image-domain pre-training optimizes the detection\nbackbone to capture holistic visual abstraction, and box-domain pre-training\nlearns instance-level semantics and task-aware concepts to initialize the parts\nout of the backbone. By incorporating the self-supervised pre-trained\nbackbones, we can pre-train all modules for various detectors in an\nunsupervised paradigm. As depicted in Figure 1, extensive experiments\ndemonstrate that AlignDet can achieve significant improvements across diverse\nprotocols, such as detection algorithm, model backbone, data setting, and\ntraining schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by\n2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs.\n","authors":["Ming Li","Jie Wu","Xionghui Wang","Chen Chen","Jie Qin","Xuefeng Xiao","Rui Wang","Min Zheng","Xin Pan"],"pdf_url":"https://arxiv.org/pdf/2307.11077v1.pdf","comment":"Accepted by ICCV 2023. 
Code and Models are publicly available.\n Project Page: https://liming-ai.github.io/AlignDet"},{"id":"http://arxiv.org/abs/2307.11069v1","updated":"2023-07-20T17:52:19Z","published":"2023-07-20T17:52:19Z","title":"Effectiveness and predictability of in-network storage cache for\n scientific workflows","summary":" Large scientific collaborations often have multiple scientists accessing the\nsame set of files while doing different analyses, which create repeated\naccesses to the large amounts of shared data located far away. These data\naccesses have long latency due to distance and occupy the limited bandwidth\navailable over the wide-area network. To reduce the wide-area network traffic\nand the data access latency, regional data storage caches have been installed\nas a new networking service. To study the effectiveness of such a cache system\nin scientific applications, we examine the Southern California Petabyte Scale\nCache for a high-energy physics experiment. By examining about 3TB of\noperational logs, we show that this cache removed 67.6% of file requests from\nthe wide-area network and reduced the traffic volume on wide-area network by\n12.3TB (or 35.4%) an average day. The reduction in the traffic volume (35.4%)\nis less than the reduction in file counts (67.6%) because the larger files are\nless likely to be reused. Due to this difference in data access patterns, the\ncache system has implemented a policy to avoid evicting smaller files when\nprocessing larger files. We also build a machine learning model to study the\npredictability of the cache behavior. Tests show that this model is able to\naccurately predict the cache accesses, cache misses, and network throughput,\nmaking the model useful for future studies on resource provisioning and\nplanning.\n","authors":["Caitlin Sim","Kesheng Wu","Alex Sim","Inder Monga","Chin Guok","Frank Wurthwein","Diego Davila","Harvey Newman","Justas Balcas"],"pdf_url":"https://arxiv.org/pdf/2307.11069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11049v1","updated":"2023-07-20T17:30:37Z","published":"2023-07-20T17:30:37Z","title":"Breadcrumbs to the Goal: Goal-Conditioned Exploration from\n Human-in-the-Loop Feedback","summary":" Exploration and reward specification are fundamental and intertwined\nchallenges for reinforcement learning. Solving sequential decision-making tasks\nrequiring expansive exploration requires either careful design of reward\nfunctions or the use of novelty-seeking exploration bonuses. Human supervisors\ncan provide effective guidance in the loop to direct the exploration process,\nbut prior methods to leverage this guidance require constant synchronous\nhigh-quality human feedback, which is expensive and impractical to obtain. In\nthis work, we present a technique called Human Guided Exploration (HuGE), which\nuses low-quality feedback from non-expert users that may be sporadic,\nasynchronous, and noisy. HuGE guides exploration for reinforcement learning not\nonly in simulation but also in the real world, all without meticulous reward\nspecification. The key concept involves bifurcating human feedback and policy\nlearning: human feedback steers exploration, while self-supervised learning\nfrom the exploration data yields unbiased policies. This procedure can leverage\nnoisy, asynchronous human feedback to learn policies with no hand-crafted\nreward design or exploration bonuses. 
HuGE is able to learn a variety of\nchallenging multi-stage robotic navigation and manipulation tasks in simulation\nusing crowdsourced feedback from non-expert users. Moreover, this paradigm can\nbe scaled to learning directly on real-world robots, using occasional,\nasynchronous feedback from human supervisors.\n","authors":["Marcel Torne","Max Balsells","Zihan Wang","Samedh Desai","Tao Chen","Pulkit Agrawal","Abhishek Gupta"],"pdf_url":"https://arxiv.org/pdf/2307.11049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11046v1","updated":"2023-07-20T17:28:01Z","published":"2023-07-20T17:28:01Z","title":"A Definition of Continual Reinforcement Learning","summary":" In this paper we develop a foundation for continual reinforcement learning.\n","authors":["David Abel","André Barreto","Benjamin Van Roy","Doina Precup","Hado van Hasselt","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2307.11046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11044v1","updated":"2023-07-20T17:27:29Z","published":"2023-07-20T17:27:29Z","title":"On the Convergence of Bounded Agents","summary":" When has an agent converged? Standard models of the reinforcement learning\nproblem give rise to a straightforward definition of convergence: An agent\nconverges when its behavior or performance in each environment state stops\nchanging. However, as we shift the focus of our learning problem from the\nenvironment's state to the agent's state, the concept of an agent's convergence\nbecomes significantly less clear. In this paper, we propose two complementary\naccounts of agent convergence in a framing of the reinforcement learning\nproblem that centers around bounded agents. The first view says that a bounded\nagent has converged when the minimal number of states needed to describe the\nagent's future behavior cannot decrease. The second view says that a bounded\nagent has converged just when the agent's performance only changes if the\nagent's internal state changes. We establish basic properties of these two\ndefinitions, show that they accommodate typical views of convergence in\nstandard settings, and prove several facts about their nature and relationship.\nWe take these perspectives, definitions, and analysis to bring clarity to a\ncentral idea of the field.\n","authors":["David Abel","André Barreto","Hado van Hasselt","Benjamin Van Roy","Doina Precup","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2307.11044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02719v3","updated":"2023-07-20T17:17:54Z","published":"2023-07-06T01:57:37Z","title":"Understanding Uncertainty Sampling","summary":" Uncertainty sampling is a prevalent active learning algorithm that queries\nsequentially the annotations of data samples which the current prediction model\nis uncertain about. However, the usage of uncertainty sampling has been largely\nheuristic: (i) There is no consensus on the proper definition of \"uncertainty\"\nfor a specific task under a specific loss; (ii) There is no theoretical\nguarantee that prescribes a standard protocol to implement the algorithm, for\nexample, how to handle the sequentially arrived annotated data under the\nframework of optimization algorithms such as stochastic gradient descent. In\nthis work, we systematically examine uncertainty sampling algorithms under both\nstream-based and pool-based active learning. 
We propose a notion of equivalent\nloss which depends on the used uncertainty measure and the original loss\nfunction and establish that an uncertainty sampling algorithm essentially\noptimizes against such an equivalent loss. The perspective verifies the\nproperness of existing uncertainty measures from two aspects: surrogate\nproperty and loss convexity. Furthermore, we propose a new notion for designing\nuncertainty measures called \\textit{loss as uncertainty}. The idea is to use\nthe conditional expected loss given the features as the uncertainty measure.\nSuch an uncertainty measure has nice analytical properties and generality to\ncover both classification and regression problems, which enable us to provide\nthe first generalization bound for uncertainty sampling algorithms under both\nstream-based and pool-based settings, in the full generality of the underlying\nmodel and problem. Lastly, we establish connections between certain variants of\nthe uncertainty sampling algorithms with risk-sensitive objectives and\ndistributional robustness, which can partly explain the advantage of\nuncertainty sampling algorithms when the sample size is small.\n","authors":["Shang Liu","Xiaocheng Li"],"pdf_url":"https://arxiv.org/pdf/2307.02719v3.pdf","comment":"Update: add numerical illustrations and experiments; correct some\n typos and modify the numbering"},{"id":"http://arxiv.org/abs/2307.11031v1","updated":"2023-07-20T17:07:28Z","published":"2023-07-20T17:07:28Z","title":"Embroid: Unsupervised Prediction Smoothing Can Improve Few-Shot\n Classification","summary":" Recent work has shown that language models' (LMs) prompt-based learning\ncapabilities make them well suited for automating data labeling in domains\nwhere manual annotation is expensive. The challenge is that while writing an\ninitial prompt is cheap, improving a prompt is costly -- practitioners often\nrequire significant labeled data in order to evaluate the impact of prompt\nmodifications. Our work asks whether it is possible to improve prompt-based\nlearning without additional labeled data. We approach this problem by\nattempting to modify the predictions of a prompt, rather than the prompt\nitself. Our intuition is that accurate predictions should also be consistent:\nsamples which are similar under some feature representation should receive the\nsame prompt prediction. We propose Embroid, a method which computes multiple\nrepresentations of a dataset under different embedding functions, and uses the\nconsistency between the LM predictions for neighboring samples to identify\nmispredictions. Embroid then uses these neighborhoods to create additional\npredictions for each sample, and combines these predictions with a simple\nlatent variable graphical model in order to generate a final corrected\nprediction. In addition to providing a theoretical analysis of Embroid, we\nconduct a rigorous empirical evaluation across six different LMs and up to 95\ndifferent tasks. We find that (1) Embroid substantially improves performance\nover original prompts (e.g., by an average of 7.3 points on GPT-JT), (2) also\nrealizes improvements for more sophisticated prompting strategies (e.g.,\nchain-of-thought), and (3) can be specialized to domains like law through the\nembedding functions.\n","authors":["Neel Guha","Mayee F. 
Chen","Kush Bhatia","Azalia Mirhoseini","Frederic Sala","Christopher Ré"],"pdf_url":"https://arxiv.org/pdf/2307.11031v1.pdf","comment":"38 pages, 22 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.11030v1","updated":"2023-07-20T17:05:51Z","published":"2023-07-20T17:05:51Z","title":"Cluster-aware Semi-supervised Learning: Relational Knowledge\n Distillation Provably Learns Clustering","summary":" Despite the empirical success and practical significance of (relational)\nknowledge distillation that matches (the relations of) features between teacher\nand student models, the corresponding theoretical interpretations remain\nlimited for various knowledge distillation paradigms. In this work, we take an\ninitial step toward a theoretical understanding of relational knowledge\ndistillation (RKD), with a focus on semi-supervised classification problems. We\nstart by casting RKD as spectral clustering on a population-induced graph\nunveiled by a teacher model. Via a notion of clustering error that quantifies\nthe discrepancy between the predicted and ground truth clusterings, we\nillustrate that RKD over the population provably leads to low clustering error.\nMoreover, we provide a sample complexity bound for RKD with limited unlabeled\nsamples. For semi-supervised learning, we further demonstrate the label\nefficiency of RKD through a general framework of cluster-aware semi-supervised\nlearning that assumes low clustering errors. Finally, by unifying data\naugmentation consistency regularization into this cluster-aware framework, we\nshow that despite the common effect of learning accurate clusterings, RKD\nfacilitates a \"global\" perspective through spectral clustering, whereas\nconsistency regularization focuses on a \"local\" perspective via expansion.\n","authors":["Yijun Dong","Kevin Miller","Qi Lei","Rachel Ward"],"pdf_url":"https://arxiv.org/pdf/2307.11030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05610v2","updated":"2023-07-20T16:46:36Z","published":"2023-05-09T17:01:17Z","title":"Can point cloud networks learn statistical shape models of anatomies?","summary":" Statistical Shape Modeling (SSM) is a valuable tool for investigating and\nquantifying anatomical variations within populations of anatomies. However,\ntraditional correspondence-based SSM generation methods have a prohibitive\ninference process and require complete geometric proxies (e.g., high-resolution\nbinary volumes or surface meshes) as input shapes to construct the SSM.\nUnordered 3D point cloud representations of shapes are more easily acquired\nfrom various medical imaging practices (e.g., thresholded images and surface\nscanning). Point cloud deep networks have recently achieved remarkable success\nin learning permutation-invariant features for different point cloud tasks\n(e.g., completion, semantic segmentation, classification). However, their\napplication to learning SSM from point clouds is to-date unexplored. In this\nwork, we demonstrate that existing point cloud encoder-decoder-based completion\nnetworks can provide an untapped potential for SSM, capturing population-level\nstatistical representations of shapes while reducing the inference burden and\nrelaxing the input requirement. We discuss the limitations of these techniques\nto the SSM application and suggest future improvements. 
Our work paves the way\nfor further exploration of point cloud deep learning for SSM, a promising\navenue for advancing shape analysis literature and broadening SSM to diverse\nuse cases.\n","authors":["Jadie Adams","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2305.05610v2.pdf","comment":"Accepted to MICCAI 2023. 13 pages, 5 figures, appendix"},{"id":"http://arxiv.org/abs/2307.11018v1","updated":"2023-07-20T16:45:22Z","published":"2023-07-20T16:45:22Z","title":"Amortized Variational Inference: When and Why?","summary":" Amortized variational inference (A-VI) is a method for approximating the\nintractable posterior distributions that arise in probabilistic models. The\ndefining feature of A-VI is that it learns a global inference function that\nmaps each observation to its local latent variable's approximate posterior.\nThis stands in contrast to the more classical factorized (or mean-field)\nvariational inference (F-VI), which directly learns the parameters of the\napproximating distribution for each latent variable. In deep generative models,\nA-VI is used as a computational trick to speed up inference for local latent\nvariables. In this paper, we study A-VI as a general alternative to F-VI for\napproximate posterior inference. A-VI cannot produce an approximation with a\nlower Kullback-Leibler divergence than F-VI's optimal solution, because the\namortized family is a subset of the factorized family. Thus a central\ntheoretical problem is to characterize when A-VI still attains F-VI's optimal\nsolution. We derive conditions on both the model and the inference function\nunder which A-VI can theoretically achieve F-VI's optimum. We show that for a\nbroad class of hierarchical models, including deep generative models, it is\npossible to close the gap between A-VI and F-VI. Further, for an even broader\nclass of models, we establish when and how to expand the domain of the\ninference function to make amortization a feasible strategy. Finally, we prove\nthat for certain models -- including hidden Markov models and Gaussian\nprocesses -- A-VI cannot match F-VI's solution, no matter how expressive the\ninference function is. We also study A-VI empirically [...]\n","authors":["Charles C. Margossian","David M. Blei"],"pdf_url":"https://arxiv.org/pdf/2307.11018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11017v1","updated":"2023-07-20T16:45:16Z","published":"2023-07-20T16:45:16Z","title":"Multi-objective point cloud autoencoders for explainable myocardial\n infarction prediction","summary":" Myocardial infarction (MI) is one of the most common causes of death in the\nworld. Image-based biomarkers commonly used in the clinic, such as ejection\nfraction, fail to capture more complex patterns in the heart's 3D anatomy and\nthus limit diagnostic accuracy. In this work, we present the multi-objective\npoint cloud autoencoder as a novel geometric deep learning approach for\nexplainable infarction prediction, based on multi-class 3D point cloud\nrepresentations of cardiac anatomy and function. Its architecture consists of\nmultiple task-specific branches connected by a low-dimensional latent space to\nallow for effective multi-objective learning of both reconstruction and MI\nprediction, while capturing pathology-specific 3D shape information in an\ninterpretable latent space. Furthermore, its hierarchical branch design with\npoint cloud-based deep learning operations enables efficient multi-scale\nfeature learning directly on high-resolution anatomy point clouds. 
In our\nexperiments on a large UK Biobank dataset, the multi-objective point cloud\nautoencoder is able to accurately reconstruct multi-temporal 3D shapes with\nChamfer distances between predicted and input anatomies below the underlying\nimages' pixel resolution. Our method outperforms multiple machine learning and\ndeep learning benchmarks for the task of incident MI prediction by 19% in terms\nof Area Under the Receiver Operating Characteristic curve. In addition, its\ntask-specific compact latent space exhibits easily separable control and MI\nclusters with clinically plausible associations between subject encodings and\ncorresponding 3D shapes, thus demonstrating the explainability of the\nprediction.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.11017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.01110v3","updated":"2023-07-20T16:44:47Z","published":"2022-07-03T20:07:00Z","title":"Data-Driven Modeling of Noise Time Series with Convolutional Generative\n Adversarial Networks","summary":" Random noise arising from physical processes is an inherent characteristic of\nmeasurements and a limiting factor for most signal processing and data analysis\ntasks. Given the recent interest in generative adversarial networks (GANs) for\ndata-driven modeling, it is important to determine to what extent GANs can\nfaithfully reproduce noise in target data sets. In this paper, we present an\nempirical investigation that aims to shed light on this issue for time series.\nNamely, we assess two general-purpose GANs for time series that are based on\nthe popular deep convolutional GAN (DCGAN) architecture, a direct time-series\nmodel and an image-based model that uses a short-time Fourier transform (STFT)\ndata representation. The GAN models are trained and quantitatively evaluated\nusing distributions of simulated noise time series with known ground-truth\nparameters. Target time series distributions include a broad range of noise\ntypes commonly encountered in physical measurements, electronics, and\ncommunication systems: band-limited thermal noise, power law noise, shot noise,\nand impulsive noise. We find that GANs are capable of learning many noise\ntypes, although they predictably struggle when the GAN architecture is not well\nsuited to some aspects of the noise, e.g., impulsive time-series with extreme\noutliers. Our findings provide insights into the capabilities and potential\nlimitations of current approaches to time-series GANs and highlight areas for\nfurther research. In addition, our battery of tests provides a useful benchmark\nto aid the development of deep generative models for time series.\n","authors":["Adam Wunderlich","Jack Sklar"],"pdf_url":"https://arxiv.org/pdf/2207.01110v3.pdf","comment":"27 pages, 20 figures"},{"id":"http://arxiv.org/abs/2302.06223v3","updated":"2023-07-20T16:40:14Z","published":"2023-02-13T09:54:50Z","title":"Variational Mixture of HyperGenerators for Learning Distributions Over\n Functions","summary":" Recent approaches build on implicit neural representations (INRs) to propose\ngenerative models over function spaces. However, they are computationally\ncostly when dealing with inference tasks, such as missing data imputation, or\ndirectly cannot tackle them. In this work, we propose a novel deep generative\nmodel, named VAMoH. VAMoH combines the capabilities of modeling continuous\nfunctions using INRs and the inference capabilities of Variational Autoencoders\n(VAEs). 
In addition, VAMoH relies on a normalizing flow to define the prior,\nand a mixture of hypernetworks to parametrize the data log-likelihood. This\ngives VAMoH a high expressive capability and interpretability. Through\nexperiments on a diverse range of data types, such as images, voxels, and\nclimate data, we show that VAMoH can effectively learn rich distributions over\ncontinuous functions. Furthermore, it can perform inference-related tasks, such\nas conditional super-resolution generation and in-painting, as well or better\nthan previous approaches, while being less computationally demanding.\n","authors":["Batuhan Koyuncu","Pablo Sanchez-Martin","Ignacio Peis","Pablo M. Olmos","Isabel Valera"],"pdf_url":"https://arxiv.org/pdf/2302.06223v3.pdf","comment":"Accepted at ICML 2023. Camera ready version"},{"id":"http://arxiv.org/abs/2012.07881v2","updated":"2023-07-20T16:38:57Z","published":"2020-12-14T19:02:26Z","title":"Perceptron Theory Can Predict the Accuracy of Neural Networks","summary":" Multilayer neural networks set the current state of the art for many\ntechnical classification problems. But, these networks are still, essentially,\nblack boxes in terms of analyzing them and predicting their performance. Here,\nwe develop a statistical theory for the one-layer perceptron and show that it\ncan predict performances of a surprisingly large variety of neural networks\nwith different architectures. A general theory of classification with\nperceptrons is developed by generalizing an existing theory for analyzing\nreservoir computing models and connectionist models for symbolic reasoning\nknown as vector symbolic architectures. Our statistical theory offers three\nformulas leveraging the signal statistics with increasing detail. The formulas\nare analytically intractable, but can be evaluated numerically. The description\nlevel that captures maximum details requires stochastic sampling methods.\nDepending on the network model, the simpler formulas already yield high\nprediction accuracy. The quality of the theory predictions is assessed in three\nexperimental settings, a memorization task for echo state networks (ESNs) from\nreservoir computing literature, a collection of classification datasets for\nshallow randomly connected networks, and the ImageNet dataset for deep\nconvolutional neural networks. We find that the second description level of the\nperceptron theory can predict the performance of types of ESNs, which could not\nbe described previously. The theory can predict deep multilayer neural networks\nby being applied to their output layer. While other methods for prediction of\nneural networks performance commonly require to train an estimator model, the\nproposed theory requires only the first two moments of the distribution of the\npostsynaptic sums in the output neurons. The perceptron theory compares\nfavorably to other methods that do not rely on training an estimator model.\n","authors":["Denis Kleyko","Antonello Rosato","E. Paxon Frady","Massimo Panella","Friedrich T. Sommer"],"pdf_url":"https://arxiv.org/pdf/2012.07881v2.pdf","comment":"16 pages, 14 figures"},{"id":"http://arxiv.org/abs/2307.11013v1","updated":"2023-07-20T16:38:18Z","published":"2023-07-20T16:38:18Z","title":"Flow Map Learning for Unknown Dynamical Systems: Overview,\n Implementation, and Benchmarks","summary":" Flow map learning (FML), in conjunction with deep neural networks (DNNs), has\nshown promises for data driven modeling of unknown dynamical systems. 
A\nremarkable feature of FML is that it is capable of producing accurate\npredictive models for partially observed systems, even when their exact\nmathematical models do not exist. In this paper, we present an overview of the\nFML framework, along with the important computational details for its\nsuccessful implementation. We also present a set of well defined benchmark\nproblems for learning unknown dynamical systems. All the numerical details of\nthese problems are presented, along with their FML results, to ensure that the\nproblems are accessible for cross-examination and the results are reproducible.\n","authors":["Victor Churchill","Dongbin Xiu"],"pdf_url":"https://arxiv.org/pdf/2307.11013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11011v1","updated":"2023-07-20T16:36:04Z","published":"2023-07-20T16:36:04Z","title":"Neuron Sensitivity Guided Test Case Selection for Deep Learning Testing","summary":" Deep Neural Networks~(DNNs) have been widely deployed in software to address\nvarious tasks~(e.g., autonomous driving, medical diagnosis). However, they\ncould also produce incorrect behaviors that result in financial losses and even\nthreaten human safety. To reveal the incorrect behaviors in DNN and repair\nthem, DNN developers often collect rich unlabeled datasets from the natural\nworld and label them to test the DNN models. However, properly labeling a large\nnumber of unlabeled datasets is a highly expensive and time-consuming task.\n To address the above-mentioned problem, we propose NSS, Neuron Sensitivity\nguided test case Selection, which can reduce the labeling time by selecting\nvaluable test cases from unlabeled datasets. NSS leverages the internal\nneuron's information induced by test cases to select valuable test cases, which\nhave high confidence in causing the model to behave incorrectly. We evaluate\nNSS with four widely used datasets and four well-designed DNN models compared\nto SOTA baseline methods. The results show that NSS performs well in assessing\nthe test cases' probability of fault triggering and model improvement\ncapabilities. Specifically, compared with baseline approaches, NSS obtains a\nhigher fault detection rate~(e.g., when selecting 5\\% test case from the\nunlabeled dataset in MNIST \\& LeNet1 experiment, NSS can obtain 81.8\\% fault\ndetection rate, 20\\% higher than baselines).\n","authors":["Dong Huang","Qingwen Bu","Yichao Fu","Yuhao Qing","Bocheng Xiao","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2307.11011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11007v1","updated":"2023-07-20T16:34:58Z","published":"2023-07-20T16:34:58Z","title":"Sharpness Minimization Algorithms Do Not Only Minimize Sharpness To\n Achieve Better Generalization","summary":" Despite extensive studies, the underlying reason as to why overparameterized\nneural networks can generalize remains elusive. Existing theory shows that\ncommon stochastic optimizers prefer flatter minimizers of the training loss,\nand thus a natural potential explanation is that flatness implies\ngeneralization. This work critically examines this explanation. Through\ntheoretical and empirical investigation, we identify the following three\nscenarios for two-layer ReLU networks: (1) flatness provably implies\ngeneralization; (2) there exist non-generalizing flattest models and sharpness\nminimization algorithms fail to generalize, and (3) perhaps most surprisingly,\nthere exist non-generalizing flattest models, but sharpness minimization\nalgorithms still generalize. 
Our results suggest that the relationship between\nsharpness and generalization subtly depends on the data distributions and the\nmodel architectures and sharpness minimization algorithms do not only minimize\nsharpness to achieve better generalization. This calls for the search for other\nexplanations for the generalization of over-parameterized neural networks.\n","authors":["Kaiyue Wen","Tengyu Ma","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2307.11007v1.pdf","comment":"34 pages,11 figures"},{"id":"http://arxiv.org/abs/2307.10999v1","updated":"2023-07-20T16:27:51Z","published":"2023-07-20T16:27:51Z","title":"Private Federated Learning with Autotuned Compression","summary":" We propose new techniques for reducing communication in private federated\nlearning without the need for setting or tuning compression rates. Our\non-the-fly methods automatically adjust the compression rate based on the error\ninduced during training, while maintaining provable privacy guarantees through\nthe use of secure aggregation and differential privacy. Our techniques are\nprovably instance-optimal for mean estimation, meaning that they can adapt to\nthe ``hardness of the problem\" with minimal interactivity. We demonstrate the\neffectiveness of our approach on real-world datasets by achieving favorable\ncompression rates without the need for tuning.\n","authors":["Enayat Ullah","Christopher A. Choquette-Choo","Peter Kairouz","Sewoong Oh"],"pdf_url":"https://arxiv.org/pdf/2307.10999v1.pdf","comment":"Accepted to ICML 2023"},{"id":"http://arxiv.org/abs/2307.10997v1","updated":"2023-07-20T16:25:58Z","published":"2023-07-20T16:25:58Z","title":"DREAM: Domain-free Reverse Engineering Attributes of Black-box Model","summary":" Deep learning models are usually black boxes when deployed on machine\nlearning platforms. Prior works have shown that the attributes ($e.g.$, the\nnumber of convolutional layers) of a target black-box neural network can be\nexposed through a sequence of queries. There is a crucial limitation: these\nworks assume the dataset used for training the target model to be known\nbeforehand and leverage this dataset for model attribute attack. However, it is\ndifficult to access the training dataset of the target black-box model in\nreality. Therefore, whether the attributes of a target black-box model could be\nstill revealed in this case is doubtful. In this paper, we investigate a new\nproblem of Domain-agnostic Reverse Engineering the Attributes of a black-box\ntarget Model, called DREAM, without requiring the availability of the target\nmodel's training dataset, and put forward a general and principled framework by\ncasting this problem as an out of distribution (OOD) generalization problem. In\nthis way, we can learn a domain-agnostic model to inversely infer the\nattributes of a target black-box model with unknown training data. 
This makes\nour method one that can be applied gracefully to an arbitrary domain\nfor model attribute reverse engineering with strong generalization ability.\nExtensive experimental studies are conducted and the results validate the\nsuperiority of our proposed method over the baselines.\n","authors":["Rongqing Li","Jiaqi Yu","Changsheng Li","Wenhan Luo","Ye Yuan","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10994v1","updated":"2023-07-20T16:25:00Z","published":"2023-07-20T16:25:00Z","title":"Progressive distillation diffusion for raw music generation","summary":" This paper aims to apply a new deep learning approach to the task of\ngenerating raw audio files. It is based on diffusion models, a recent type of\ndeep generative model. This new type of method has recently shown outstanding\nresults with image generation. A lot of focus has been given to those models by\nthe computer vision community. On the other hand, far less attention has been\ngiven to other types of applications, such as music generation in the waveform\ndomain.\n In this paper, a model for unconditional generation applied to music is\nimplemented: progressive distillation diffusion with a 1D U-Net. Then, a\ncomparison of different diffusion parameters and their effect on the final\nresult is presented. One big advantage of the methods implemented in this work\nis that the model is able to handle progressive audio processing and\ngeneration, using a transformation from 1-channel 128 x 384 to 3-channel 128 x\n128 mel-spectrograms and looped generation. The empirical comparisons are\ncarried out across different self-collected datasets.\n","authors":["Svetlana Pavlova"],"pdf_url":"https://arxiv.org/pdf/2307.10994v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2207.12395v3","updated":"2023-07-20T16:21:58Z","published":"2022-07-25T17:58:09Z","title":"Tuning Stochastic Gradient Algorithms for Statistical Inference via\n Large-Sample Asymptotics","summary":" The tuning of stochastic gradient algorithms (SGAs) for optimization and\nsampling is often based on heuristics and trial-and-error rather than\ngeneralizable theory. We address this theory--practice gap by characterizing\nthe large-sample statistical asymptotics of SGAs via a joint\nstep-size--sample-size scaling limit. We show that iterate averaging with a\nlarge fixed step size is robust to the choice of tuning parameters and\nasymptotically has covariance proportional to that of the MLE sampling\ndistribution. We also prove a Bernstein--von Mises-like theorem to guide\ntuning, including for generalized posteriors that are robust to model\nmisspecification. Numerical experiments validate our results and\nrecommendations in realistic finite-sample regimes. Our work lays the\nfoundation for a systematic analysis of other stochastic gradient Markov chain\nMonte Carlo algorithms for a wide range of models.\n","authors":["Jeffrey Negrea","Jun Yang","Haoyue Feng","Daniel M. Roy","Jonathan H. Huggins"],"pdf_url":"https://arxiv.org/pdf/2207.12395v3.pdf","comment":"42 pgs"},{"id":"http://arxiv.org/abs/2307.10988v1","updated":"2023-07-20T16:18:33Z","published":"2023-07-20T16:18:33Z","title":"Investigating minimizing the training set fill distance in machine\n learning regression","summary":" Many machine learning regression methods leverage large datasets for training\npredictive models. However, using large datasets may not be feasible due to\ncomputational limitations or high labelling costs. 
Therefore, sampling small\ntraining sets from large pools of unlabelled data points is essential to\nmaximize model performance while maintaining computational efficiency. In this\nwork, we study a sampling approach aimed to minimize the fill distance of the\nselected set. We derive an upper bound for the maximum expected prediction\nerror that linearly depends on the training set fill distance, conditional to\nthe knowledge of data features. For empirical validation, we perform\nexperiments using two regression models on two datasets. We empirically show\nthat selecting a training set by aiming to minimize the fill distance, thereby\nminimizing the bound, significantly reduces the maximum prediction error of\nvarious regression models, outperforming existing sampling approaches by a\nlarge margin.\n","authors":["Paolo Climaco","Jochen Garcke"],"pdf_url":"https://arxiv.org/pdf/2307.10988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09943v2","updated":"2023-07-20T16:11:39Z","published":"2023-07-19T12:35:16Z","title":"Impatient Bandits: Optimizing Recommendations for the Long-Term Without\n Delay","summary":" Recommender systems are a ubiquitous feature of online platforms.\nIncreasingly, they are explicitly tasked with increasing users' long-term\nsatisfaction. In this context, we study a content exploration task, which we\nformalize as a multi-armed bandit problem with delayed rewards. We observe that\nthere is an apparent trade-off in choosing the learning signal: Waiting for the\nfull reward to become available might take several weeks, hurting the rate at\nwhich learning happens, whereas measuring short-term proxy rewards reflects the\nactual long-term goal only imperfectly. We address this challenge in two steps.\nFirst, we develop a predictive model of delayed rewards that incorporates all\ninformation obtained to date. Full observations as well as partial (short or\nmedium-term) outcomes are combined through a Bayesian filter to obtain a\nprobabilistic belief. Second, we devise a bandit algorithm that takes advantage\nof this new predictive model. The algorithm quickly learns to identify content\naligned with long-term success by carefully balancing exploration and\nexploitation. We apply our approach to a podcast recommendation problem, where\nwe seek to identify shows that users engage with repeatedly over two months. We\nempirically validate that our approach results in substantially better\nperformance compared to approaches that either optimize for short-term proxies,\nor wait for the long-term outcome to be fully realized.\n","authors":["Thomas M. McDonald","Lucas Maystre","Mounia Lalmas","Daniel Russo","Kamil Ciosek"],"pdf_url":"https://arxiv.org/pdf/2307.09943v2.pdf","comment":"Presented at the 29th ACM SIGKDD Conference on Knowledge Discovery\n and Data Mining (KDD '23)"},{"id":"http://arxiv.org/abs/2307.10982v1","updated":"2023-07-20T16:09:57Z","published":"2023-07-20T16:09:57Z","title":"MASR: Metadata Aware Speech Representation","summary":" In the recent years, speech representation learning is constructed primarily\nas a self-supervised learning (SSL) task, using the raw audio signal alone,\nwhile ignoring the side-information that is often available for a given speech\nrecording. In this paper, we propose MASR, a Metadata Aware Speech\nRepresentation learning framework, which addresses the aforementioned\nlimitations. MASR enables the inclusion of multiple external knowledge sources\nto enhance the utilization of meta-data information. 
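An aside on the fill-distance entry above (Climaco and Garcke): greedily adding the pool point that is farthest from the already-selected set is the standard way to shrink the fill distance of a selected subset. The sketch below only illustrates that idea on synthetic data; the helper name and the use of numpy are assumptions, not the authors' code.

```python
# Illustrative sketch (not the authors' implementation): greedy farthest-point
# selection of a small training set, which greedily reduces the fill distance,
# i.e. the largest distance from any pool point to its nearest selected point.
import numpy as np

def select_by_fill_distance(pool: np.ndarray, k: int, seed: int = 0) -> np.ndarray:
    """Return indices of k points chosen greedily from pool of shape (n, d)."""
    rng = np.random.default_rng(seed)
    n = pool.shape[0]
    selected = [int(rng.integers(n))]                  # arbitrary starting point
    # distance from every pool point to its nearest selected point
    dists = np.linalg.norm(pool - pool[selected[0]], axis=1)
    for _ in range(k - 1):
        nxt = int(np.argmax(dists))                    # point realizing the current fill distance
        selected.append(nxt)
        dists = np.minimum(dists, np.linalg.norm(pool - pool[nxt], axis=1))
    return np.array(selected)

# Example: pick 100 of 10,000 random 5-D pool points for labelling/training.
X = np.random.default_rng(1).normal(size=(10_000, 5))
idx = select_by_fill_distance(X, k=100)
print(idx.shape)  # (100,)
```

Each greedy step picks the point that currently attains the fill distance, so the error bound described in the abstract, which grows with the fill distance, tightens as points are added.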
The external knowledge\nsources are incorporated in the form of sample-level pair-wise similarity\nmatrices that are useful in a hard-mining loss. A key advantage of the MASR\nframework is that it can be combined with any choice of SSL method. Using MASR\nrepresentations, we perform evaluations on several downstream tasks such as\nlanguage identification, speech recognition and other non-semantic tasks such\nas speaker and emotion recognition. In these experiments, we illustrate\nsignificant performance improvements for the MASR over other established\nbenchmarks. We perform a detailed analysis on the language identification task\nto provide insights on how the proposed loss function enables the\nrepresentations to separate closely related languages.\n","authors":["Anjali Raj","Shikhar Bharadwaj","Sriram Ganapathy","Min Ma","Shikhar Vashishth"],"pdf_url":"https://arxiv.org/pdf/2307.10982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10981v1","updated":"2023-07-20T16:09:07Z","published":"2023-07-20T16:09:07Z","title":"PATROL: Privacy-Oriented Pruning for Collaborative Inference Against\n Model Inversion Attacks","summary":" Collaborative inference has been a promising solution to enable\nresource-constrained edge devices to perform inference using state-of-the-art\ndeep neural networks (DNNs). In collaborative inference, the edge device first\nfeeds the input to a partial DNN locally and then uploads the intermediate\nresult to the cloud to complete the inference. However, recent research\nindicates model inversion attacks (MIAs) can reconstruct input data from\nintermediate results, posing serious privacy concerns for collaborative\ninference. Existing perturbation and cryptography techniques are inefficient\nand unreliable in defending against MIAs while performing accurate inference.\nThis paper provides a viable solution, named PATROL, which develops\nprivacy-oriented pruning to balance privacy, efficiency, and utility of\ncollaborative inference. PATROL takes advantage of the fact that later layers\nin a DNN can extract more task-specific features. Given limited local resources\nfor collaborative inference, PATROL intends to deploy more layers at the edge\nbased on pruning techniques to enforce task-specific features for inference and\nreduce task-irrelevant but sensitive features for privacy preservation. To\nachieve privacy-oriented pruning, PATROL introduces two key components:\nLipschitz regularization and adversarial reconstruction training, which\nincrease the reconstruction errors by reducing the stability of MIAs and\nenhance the target inference model by adversarial training, respectively.\n","authors":["Shiwei Ding","Lan Zhang","Miao Pan","Xiaoyong Yuan"],"pdf_url":"https://arxiv.org/pdf/2307.10981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03017v3","updated":"2023-07-20T16:05:39Z","published":"2023-05-04T17:43:19Z","title":"Improving Code Example Recommendations on Informal Documentation Using\n BERT and Query-Aware LSH: A Comparative Study","summary":" Our research investigates the recommendation of code examples to aid software\ndevelopers, a practice that saves developers significant time by providing\nready-to-use code snippets. The focus of our study is Stack Overflow, a\ncommonly used resource for coding discussions and solutions, particularly in\nthe context of the Java programming language. 
We applied BERT, a powerful Large\nLanguage Model (LLM) that enables us to transform code examples into numerical\nvectors by extracting their semantic information. Once these numerical\nrepresentations are prepared, we identify Approximate Nearest Neighbors (ANN)\nusing Locality-Sensitive Hashing (LSH). Our research employed two variants of\nLSH: Random Hyperplane-based LSH and Query-Aware LSH. We rigorously compared\nthese two approaches across four parameters: HitRate, Mean Reciprocal Rank\n(MRR), Average Execution Time, and Relevance. Our study revealed that the\nQuery-Aware (QA) approach showed superior performance over the Random\nHyperplane-based (RH) method. Specifically, it exhibited a notable improvement\nof 20% to 35% in HitRate for query pairs compared to the RH approach.\nFurthermore, the QA approach proved significantly more time-efficient, with its\nspeed in creating hashing tables and assigning data samples to buckets being at\nleast four times faster. It can return code examples within milliseconds,\nwhereas the RH approach typically requires several seconds to recommend code\nexamples. Due to the superior performance of the QA approach, we tested it\nagainst PostFinder and FaCoY, the state-of-the-art baselines. Our QA method\nshowed comparable efficiency proving its potential for effective code\nrecommendation.\n","authors":["Sajjad Rahmani","AmirHossein Naghshzan","Latifa Guerrouj"],"pdf_url":"https://arxiv.org/pdf/2305.03017v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12619v2","updated":"2023-07-20T16:04:19Z","published":"2023-06-22T01:14:47Z","title":"Class-Incremental Learning based on Label Generation","summary":" Despite the great success of pre-trained language models, it is still a\nchallenge to use these models for continual learning, especially for the\nclass-incremental learning (CIL) setting due to catastrophic forgetting (CF).\nThis paper reports our finding that if we formulate CIL as a continual label\ngeneration problem, CF is drastically reduced and the generalizable\nrepresentations of pre-trained models can be better retained. We thus propose a\nnew CIL method (VAG) that also leverages the sparsity of vocabulary to focus\nthe generation and creates pseudo-replay samples by using label semantics.\nExperimental results show that VAG outperforms baselines by a large margin.\n","authors":["Yijia Shao","Yiduo Guo","Dongyan Zhao","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.12619v2.pdf","comment":"12 pages, ACL 2023 Main Conference"},{"id":"http://arxiv.org/abs/2307.10975v1","updated":"2023-07-20T16:04:07Z","published":"2023-07-20T16:04:07Z","title":"Globally Normalising the Transducer for Streaming Speech Recognition","summary":" The Transducer (e.g. RNN-Transducer or Conformer-Transducer) generates an\noutput label sequence as it traverses the input sequence. It is straightforward\nto use in streaming mode, where it generates partial hypotheses before the\ncomplete input has been seen. This makes it popular in speech recognition.\nHowever, in streaming mode the Transducer has a mathematical flaw which, simply\nput, restricts the model's ability to change its mind. The fix is to replace\nlocal normalisation (e.g. a softmax) with global normalisation, but then the\nloss function becomes impossible to evaluate exactly. 
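Returning to the BERT-plus-LSH code-recommendation entry above (Rahmani et al.): the random hyperplane variant hashes each embedding vector by the sign pattern of a few random projections, so nearby vectors tend to share a bucket and become approximate-nearest-neighbour candidates. The snippet below is a hedged sketch with synthetic embeddings and invented names; the Query-Aware variant from that entry is not reproduced.

```python
# Illustrative sketch only: random hyperplane LSH over embedding vectors
# (stand-ins for BERT embeddings of code snippets). Vectors whose projections
# have the same sign pattern land in the same bucket.
import numpy as np
from collections import defaultdict

rng = np.random.default_rng(0)
dim, n_planes = 768, 8                        # 8 hyperplanes -> 256 buckets
planes = rng.normal(size=(n_planes, dim))     # one random hyperplane per row

def bucket_key(vec: np.ndarray) -> int:
    """Hash a vector to a bucket via the sign pattern of its projections."""
    bits = (planes @ vec) > 0
    return int("".join("1" if b else "0" for b in bits), 2)

snippets = rng.normal(size=(5_000, dim))      # synthetic snippet embeddings
index = defaultdict(list)
for i, v in enumerate(snippets):
    index[bucket_key(v)].append(i)

def recommend(query_vec: np.ndarray, top_k: int = 5) -> list:
    """Rank the query's bucket mates by cosine similarity."""
    candidates = index.get(bucket_key(query_vec), [])
    if not candidates:
        return []
    cand = snippets[candidates]
    sims = cand @ query_vec / (np.linalg.norm(cand, axis=1) * np.linalg.norm(query_vec))
    order = np.argsort(-sims)[:top_k]
    return [candidates[i] for i in order]

print(recommend(snippets[0]))                 # the query snippet itself ranks first
```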
A recent paper proposes\nto solve this by approximating the model, severely degrading performance.\nInstead, this paper proposes to approximate the loss function, allowing global\nnormalisation to apply to a state-of-the-art streaming model. Global\nnormalisation reduces its word error rate by 9-11% relative, closing almost\nhalf the gap between streaming and lookahead mode.\n","authors":["Rogier van Dalen"],"pdf_url":"https://arxiv.org/pdf/2307.10975v1.pdf","comment":"9 pages plus references and appendices"},{"id":"http://arxiv.org/abs/2210.06089v2","updated":"2023-07-20T16:01:03Z","published":"2022-10-12T11:04:22Z","title":"When are Local Queries Useful for Robust Learning?","summary":" Distributional assumptions have been shown to be necessary for the robust\nlearnability of concept classes when considering the exact-in-the-ball robust\nrisk and access to random examples by Gourdeau et al. (2019). In this paper, we\nstudy learning models where the learner is given more power through the use of\nlocal queries, and give the first distribution-free algorithms that perform\nrobust empirical risk minimization (ERM) for this notion of robustness. The\nfirst learning model we consider uses local membership queries (LMQ), where the\nlearner can query the label of points near the training sample. We show that,\nunder the uniform distribution, LMQs do not increase the robustness threshold\nof conjunctions and any superclass, e.g., decision lists and halfspaces. Faced\nwith this negative result, we introduce the local equivalence query\n($\\mathsf{LEQ}$) oracle, which returns whether the hypothesis and target\nconcept agree in the perturbation region around a point in the training sample,\nas well as a counterexample if it exists. We show a separation result: on the\none hand, if the query radius $\\lambda$ is strictly smaller than the\nadversary's perturbation budget $\\rho$, then distribution-free robust learning\nis impossible for a wide variety of concept classes; on the other hand, the\nsetting $\\lambda=\\rho$ allows us to develop robust ERM algorithms. We then\nbound the query complexity of these algorithms based on online learning\nguarantees and further improve these bounds for the special case of\nconjunctions. We finish by giving robust learning algorithms for halfspaces on\n$\\{0,1\\}^n$ and then obtaining robustness guarantees for halfspaces in\n$\\mathbb{R}^n$ against precision-bounded adversaries.\n","authors":["Pascale Gourdeau","Varun Kanade","Marta Kwiatkowska","James Worrell"],"pdf_url":"https://arxiv.org/pdf/2210.06089v2.pdf","comment":"Accepted to NeurIPS 2022; V2 contains new results (Section 3.6) and\n an erratum from the previous version (Appendix C)"},{"id":"http://arxiv.org/abs/2204.06362v2","updated":"2023-07-20T15:48:35Z","published":"2022-04-13T13:16:21Z","title":"A Review of Machine Learning Methods Applied to Structural Dynamics and\n Vibroacoustic","summary":" The use of Machine Learning (ML) has rapidly spread across several fields,\nhaving encountered many applications in Structural Dynamics and Vibroacoustic\n(SD\\&V). The increasing capabilities of ML to unveil insights from data, driven\nby unprecedented data availability, algorithms advances and computational\npower, enhance decision making, uncertainty handling, patterns recognition and\nreal-time assessments. Three main applications in SD\\&V have taken advantage of\nthese benefits. In Structural Health Monitoring, ML detection and prognosis\nlead to safe operation and optimized maintenance schedules. 
System\nidentification and control design are leveraged by ML techniques in Active\nNoise Control and Active Vibration Control. Finally, the so-called ML-based\nsurrogate models provide fast alternatives to costly simulations, enabling\nrobust and optimized product design. Despite the many works in the area, they\nhave not been reviewed and analyzed. Therefore, to keep track and understand\nthis ongoing integration of fields, this paper presents a survey of ML\napplications in SD\\&V analyses, shedding light on the current state of\nimplementation and emerging opportunities. The main methodologies, advantages,\nlimitations, and recommendations based on scientific knowledge were identified\nfor each of the three applications. Moreover, the paper considers the role of\nDigital Twins and Physics Guided ML to overcome current challenges and power\nfuture research progress. As a result, the survey provides a broad overview of\nthe present landscape of ML applied in SD\\&V and guides the reader to an\nadvanced understanding of progress and prospects in the field.\n","authors":["Barbara Cunha","Christophe Droz","Abdelmalek Zine","Stéphane Foulard","Mohamed Ichchou"],"pdf_url":"https://arxiv.org/pdf/2204.06362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10936v1","updated":"2023-07-20T15:09:06Z","published":"2023-07-20T15:09:06Z","title":"PASTA: Pretrained Action-State Transformer Agents","summary":" Self-supervised learning has brought about a revolutionary paradigm shift in\nvarious computing domains, including NLP, vision, and biology. Recent\napproaches involve pre-training transformer models on vast amounts of unlabeled\ndata, serving as a starting point for efficiently solving downstream tasks. In\nthe realm of reinforcement learning, researchers have recently adapted these\napproaches by developing models pre-trained on expert trajectories, enabling\nthem to address a wide range of tasks, from robotics to recommendation systems.\nHowever, existing methods mostly rely on intricate pre-training objectives\ntailored to specific downstream applications. This paper presents a\ncomprehensive investigation of models we refer to as Pretrained Action-State\nTransformer Agents (PASTA). Our study uses a unified methodology and covers an\nextensive set of general downstream tasks including behavioral cloning, offline\nRL, sensor failure robustness, and dynamics change adaptation. Our goal is to\nsystematically compare various design choices and provide valuable insights to\npractitioners for building robust models. Key highlights of our study include\ntokenization at the action and state component level, using fundamental\npre-training objectives like next token prediction, training models across\ndiverse domains simultaneously, and using parameter efficient fine-tuning\n(PEFT). The developed models in our study contain fewer than 10 million\nparameters and the application of PEFT enables fine-tuning of fewer than 10,000\nparameters during downstream adaptation, allowing a broad community to use\nthese models and reproduce our experiments. 
We hope that this study will\nencourage further research into the use of transformers with first-principles\ndesign choices to represent RL trajectories and contribute to robust policy\nlearning.\n","authors":["Raphael Boige","Yannis Flet-Berliac","Arthur Flajolet","Guillaume Richard","Thomas Pierrot"],"pdf_url":"https://arxiv.org/pdf/2307.10936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10935v1","updated":"2023-07-20T15:07:49Z","published":"2023-07-20T15:07:49Z","title":"Inorganic synthesis-structure maps in zeolites with machine learning and\n crystallographic distances","summary":" Zeolites are inorganic materials known for their diversity of applications,\nsynthesis conditions, and resulting polymorphs. Although their synthesis is\ncontrolled both by inorganic and organic synthesis conditions, computational\nstudies of zeolite synthesis have focused mostly on organic template design. In\nthis work, we use a strong distance metric between crystal structures and\nmachine learning (ML) to create inorganic synthesis maps in zeolites. Starting\nwith 253 known zeolites, we show how the continuous distances between\nframeworks reproduce inorganic synthesis conditions from the literature without\nusing labels such as building units. An unsupervised learning analysis shows\nthat neighboring zeolites according to our metric often share similar inorganic\nsynthesis conditions, even in template-based routes. In combination with ML\nclassifiers, we find synthesis-structure relationships for 14 common inorganic\nconditions in zeolites, namely Al, B, Be, Ca, Co, F, Ga, Ge, K, Mg, Na, P, Si,\nand Zn. By explaining the model predictions, we demonstrate how\n(dis)similarities towards known structures can be used as features for the\nsynthesis space. Finally, we show how these methods can be used to predict\ninorganic synthesis conditions for unrealized frameworks in hypothetical\ndatabases and interpret the outcomes by extracting local structural patterns\nfrom zeolites. In combination with template design, this work can accelerate\nthe exploration of the space of synthesis conditions for zeolites.\n","authors":["Daniel Schwalbe-Koda","Daniel E. Widdowson","Tuan Anh Pham","Vitaliy A. Kurlin"],"pdf_url":"https://arxiv.org/pdf/2307.10935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10927v1","updated":"2023-07-20T14:56:29Z","published":"2023-07-20T14:56:29Z","title":"Modeling 3D cardiac contraction and relaxation with point cloud\n deformation networks","summary":" Global single-valued biomarkers of cardiac function typically used in\nclinical practice, such as ejection fraction, provide limited insight on the\ntrue 3D cardiac deformation process and hence, limit the understanding of both\nhealthy and pathological cardiac mechanics. In this work, we propose the Point\nCloud Deformation Network (PCD-Net) as a novel geometric deep learning approach\nto model 3D cardiac contraction and relaxation between the extreme ends of the\ncardiac cycle. It employs the recent advances in point cloud-based deep\nlearning into an encoder-decoder structure, in order to enable efficient\nmulti-scale feature learning directly on multi-class 3D point cloud\nrepresentations of the cardiac anatomy. We evaluate our approach on a large\ndataset of over 10,000 cases from the UK Biobank study and find average Chamfer\ndistances between the predicted and ground truth anatomies below the pixel\nresolution of the underlying image acquisition. 
Furthermore, we observe similar\nclinical metrics between predicted and ground truth populations and show that\nthe PCD-Net can successfully capture subpopulation-specific differences between\nnormal subjects and myocardial infarction (MI) patients. We then demonstrate\nthat the learned 3D deformation patterns outperform multiple clinical\nbenchmarks by 13% and 7% in terms of area under the receiver operating\ncharacteristic curve for the tasks of prevalent MI detection and incident MI\nprediction, and by 7% in terms of Harrell's concordance index for MI survival\nanalysis.\n","authors":["Marcel Beetz","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2307.10927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10926v1","updated":"2023-07-20T14:52:45Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard deviation across the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry out experiments on\n3D image segmentation using the standard nnU-net framework, two datasets from\nthe Medical Decathlon challenge and two performance measures: the Dice accuracy\nand the Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquax","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.10923v1","updated":"2023-07-20T14:49:58Z","published":"2023-07-20T14:49:58Z","title":"Sequential Multi-Dimensional Self-Supervised Learning for Clinical Time\n Series","summary":" Self-supervised learning (SSL) for clinical time series data has received\nsignificant attention in recent literature, since these data are highly rich\nand provide important information about a patient's physiological state.\nHowever, most existing SSL methods for clinical time series are limited in that\nthey are designed for unimodal time series, such as a sequence of structured\nfeatures (e.g., lab values and vital signs) or an individual high-dimensional\nphysiological signal (e.g., an electrocardiogram). These existing methods\ncannot be readily extended to model time series that exhibit multimodality,\nwith structured features and high-dimensional data being recorded at each\ntimestep in the sequence. 
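A worked companion to the confidence-interval entry above (El Jurdi et al.): with a per-case spread of about 3% and roughly 150 test cases, the parametric interval for a mean Dice score is already about 1% wide and closely matches a bootstrap percentile interval. The sketch below uses simulated scores and assumed sample sizes purely for illustration.

```python
# Illustrative sketch: parametric vs. bootstrap 95% confidence interval for a
# mean Dice score over a test set (values are simulated, not data from the paper).
import numpy as np

rng = np.random.default_rng(0)
dice = np.clip(rng.normal(loc=0.85, scale=0.03, size=150), 0.0, 1.0)  # 150 test cases

n = dice.size
mean, sd = dice.mean(), dice.std(ddof=1)

# Parametric (normal approximation) interval for the mean.
half = 1.96 * sd / np.sqrt(n)
param_ci = (mean - half, mean + half)

# Bootstrap percentile interval for the mean.
boot_means = np.array([rng.choice(dice, size=n, replace=True).mean() for _ in range(10_000)])
boot_lo, boot_hi = np.percentile(boot_means, [2.5, 97.5])

print(f"mean Dice = {mean:.4f}")
print(f"parametric 95% CI = ({param_ci[0]:.4f}, {param_ci[1]:.4f}), width = {2 * half:.4f}")
print(f"bootstrap  95% CI = ({boot_lo:.4f}, {boot_hi:.4f}), width = {boot_hi - boot_lo:.4f}")
```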
In this work, we address this gap and propose a new\nSSL method -- Sequential Multi-Dimensional SSL -- where a SSL loss is applied\nboth at the level of the entire sequence and at the level of the individual\nhigh-dimensional data points in the sequence in order to better capture\ninformation at both scales. Our strategy is agnostic to the specific form of\nloss function used at each level -- it can be contrastive, as in SimCLR, or\nnon-contrastive, as in VICReg. We evaluate our method on two real-world\nclinical datasets, where the time series contains sequences of (1)\nhigh-frequency electrocardiograms and (2) structured data from lab values and\nvitals signs. Our experimental results indicate that pre-training with our\nmethod and then fine-tuning on downstream tasks improves performance over\nbaselines on both datasets, and in several settings, can lead to improvements\nacross different self-supervised loss functions.\n","authors":["Aniruddh Raghu","Payal Chandak","Ridwan Alam","John Guttag","Collin M. Stultz"],"pdf_url":"https://arxiv.org/pdf/2307.10923v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2307.10922v1","updated":"2023-07-20T14:47:50Z","published":"2023-07-20T14:47:50Z","title":"Language-based Action Concept Spaces Improve Video Self-Supervised\n Learning","summary":" Recent contrastive language image pre-training has led to learning highly\ntransferable and robust image representations. However, adapting these models\nto video domains with minimal supervision remains an open problem. We explore a\nsimple step in that direction, using language tied self-supervised learning to\nadapt an image CLIP model to the video domain. A backbone modified for temporal\nmodeling is trained under self-distillation settings with train objectives\noperating in an action concept space. Feature vectors of various action\nconcepts extracted from a language encoder using relevant textual prompts\nconstruct this space. We introduce two train objectives, concept distillation\nand concept alignment, that retain generality of original representations while\nenforcing relations between actions and their attributes. Our approach improves\nzero-shot and linear probing performance on three action recognition\nbenchmarks.\n","authors":["Kanchana Ranasinghe","Michael Ryoo"],"pdf_url":"https://arxiv.org/pdf/2307.10922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14319v3","updated":"2023-07-20T14:37:12Z","published":"2022-12-29T14:28:32Z","title":"Gaussian Process Priors for Systems of Linear Partial Differential\n Equations with Constant Coefficients","summary":" Partial differential equations (PDEs) are important tools to model physical\nsystems and including them into machine learning models is an important way of\nincorporating physical knowledge. Given any system of linear PDEs with constant\ncoefficients, we propose a family of Gaussian process (GP) priors, which we\ncall EPGP, such that all realizations are exact solutions of this system. We\napply the Ehrenpreis-Palamodov fundamental principle, which works as a\nnon-linear Fourier transform, to construct GP kernels mirroring standard\nspectral methods for GPs. Our approach can infer probable solutions of linear\nPDE systems from any data such as noisy measurements, or pointwise defined\ninitial and boundary conditions. Constructing EPGP-priors is algorithmic,\ngenerally applicable, and comes with a sparse version (S-EPGP) that learns the\nrelevant spectral frequencies and works better for big data sets. 
We\ndemonstrate our approach on three families of systems of PDEs, the heat\nequation, wave equation, and Maxwell's equations, where we improve upon the\nstate of the art in computation time and precision, in some experiments by\nseveral orders of magnitude.\n","authors":["Marc Härkönen","Markus Lange-Hegermann","Bogdan Raiţă"],"pdf_url":"https://arxiv.org/pdf/2212.14319v3.pdf","comment":"26 pages, 8 figures; ICML 2023 (oral); updated with expanded\n appendices and ancillary files. Code available at\n https://github.com/haerski/EPGP. For animations, see\n https://mathrepo.mis.mpg.de/EPGP/index.html"},{"id":"http://arxiv.org/abs/2307.00405v2","updated":"2023-07-20T14:36:11Z","published":"2023-07-01T18:35:21Z","title":"Provably Efficient UCB-type Algorithms For Learning Predictive State\n Representations","summary":" The general sequential decision-making problem, which includes Markov\ndecision processes (MDPs) and partially observable MDPs (POMDPs) as special\ncases, aims at maximizing a cumulative reward by making a sequence of decisions\nbased on a history of observations and actions over time. Recent studies have\nshown that the sequential decision-making problem is statistically learnable if\nit admits a low-rank structure modeled by predictive state representations\n(PSRs). Despite these advancements, existing approaches typically involve\noracles or steps that are not computationally efficient. On the other hand, the\nupper confidence bound (UCB) based approaches, which have served successfully\nas computationally efficient methods in bandits and MDPs, have not been\ninvestigated for more general PSRs, due to the difficulty of optimistic bonus\ndesign in these more challenging settings. This paper proposes the first known\nUCB-type approach for PSRs, featuring a novel bonus term that upper bounds the\ntotal variation distance between the estimated and true models. We further\ncharacterize the sample complexity bounds for our designed UCB-type algorithms\nfor both online and offline PSRs. In contrast to existing approaches for PSRs,\nour UCB-type algorithms enjoy computational efficiency, last-iterate guaranteed\nnear-optimal policy, and guaranteed model accuracy.\n","authors":["Ruiquan Huang","Yingbin Liang","Jing Yang"],"pdf_url":"https://arxiv.org/pdf/2307.00405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10907v1","updated":"2023-07-20T14:29:51Z","published":"2023-07-20T14:29:51Z","title":"The Role of Entropy and Reconstruction in Multi-View Self-Supervised\n Learning","summary":" The mechanisms behind the success of multi-view self-supervised learning\n(MVSSL) are not yet fully understood. Contrastive MVSSL methods have been\nstudied through the lens of InfoNCE, a lower bound of the Mutual Information\n(MI). However, the relation between other MVSSL methods and MI remains unclear.\nWe consider a different lower bound on the MI consisting of an entropy and a\nreconstruction term (ER), and analyze the main MVSSL families through its lens.\nThrough this ER bound, we show that clustering-based methods such as\nDeepCluster and SwAV maximize the MI. We also re-interpret the mechanisms of\ndistillation-based approaches such as BYOL and DINO, showing that they\nexplicitly maximize the reconstruction term and implicitly encourage a stable\nentropy, and we confirm this empirically. 
We show that replacing the objectives\nof common MVSSL methods with this ER bound achieves competitive performance,\nwhile making them stable when training with smaller batch sizes or smaller\nexponential moving average (EMA) coefficients.\n Github repo: https://github.com/apple/ml-entropy-reconstruction.\n","authors":["Borja Rodríguez-Gálvez","Arno Blaas","Pau Rodríguez","Adam Goliński","Xavier Suau","Jason Ramapuram","Dan Busbridge","Luca Zappella"],"pdf_url":"https://arxiv.org/pdf/2307.10907v1.pdf","comment":"18 pages: 9 of main text, 2 of references, and 7 of supplementary\n material. Appears in the proceedings of ICML 2023"},{"id":"http://arxiv.org/abs/2110.05216v2","updated":"2023-07-20T14:29:07Z","published":"2021-10-11T12:32:56Z","title":"High-order Tensor Pooling with Attention for Action Recognition","summary":" We aim at capturing high-order statistics of feature vectors formed by a\nneural network, and propose end-to-end second- and higher-order pooling to form\na tensor descriptor. Tensor descriptors require a robust similarity measure due\nto low numbers of aggregated vectors and the burstiness phenomenon, when a\ngiven feature appears more/less frequently than statistically expected. The\nHeat Diffusion Process (HDP) on a graph Laplacian is closely related to the\nEigenvalue Power Normalization (EPN) of the covariance/auto-correlation matrix,\nwhose inverse forms a loopy graph Laplacian. We show that the HDP and the EPN\nplay the same role, i.e., to boost or dampen the magnitude of the eigenspectrum\nthus preventing the burstiness. We equip higher-order tensors with EPN which\nacts as a spectral detector of higher-order occurrences to prevent burstiness.\nWe also prove that for a tensor of order r built from d dimensional feature\ndescriptors, such a detector gives the likelihood if at least one higher-order\noccurrence is 'projected' into one of binom(d,r) subspaces represented by the\ntensor; thus forming a tensor power normalization metric endowed with\nbinom(d,r) such 'detectors'. For experimental contributions, we apply several\nsecond- and higher-order pooling variants to action recognition, provide\npreviously not presented comparisons of such pooling variants, and show\nstate-of-the-art results on HMDB-51, YUP++ and MPII Cooking Activities.\n","authors":["Piotr Koniusz","Lei Wang","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2110.05216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10895v1","updated":"2023-07-20T14:18:44Z","published":"2023-07-20T14:18:44Z","title":"Variational Point Encoding Deformation for Dental Modeling","summary":" Digital dentistry has made significant advancements in recent years, yet\nnumerous challenges remain to be addressed. In this study, we release a new\nextensive dataset of tooth meshes to encourage further research. Additionally,\nwe propose Variational FoldingNet (VF-Net), which extends FoldingNet to enable\nprobabilistic learning of point cloud representations. A key challenge in\nexisting latent variable models for point clouds is the lack of a 1-to-1\nmapping between input points and output points. Instead, they must rely on\noptimizing Chamfer distances, a metric that does not have a normalized\ndistributional counterpart, preventing its usage in probabilistic models. We\ndemonstrate that explicit minimization of Chamfer distances can be replaced by\na suitable encoder, which allows us to increase computational efficiency while\nsimplifying the probabilistic extension. 
Our experimental findings present\nempirical evidence demonstrating the superior performance of VF-Net over\nexisting models in terms of dental scan reconstruction and extrapolation.\nAdditionally, our investigation highlights the robustness of VF-Net's latent\nrepresentations. These results underscore the promising prospects of VF-Net as\nan effective and reliable method for point cloud reconstruction and analysis.\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10892v1","updated":"2023-07-20T14:11:29Z","published":"2023-07-20T14:11:29Z","title":"Learning and Generalizing Polynomials in Simulation Metamodeling","summary":" The ability to learn polynomials and generalize out-of-distribution is\nessential for simulation metamodels in many disciplines of engineering, where\nthe time step updates are described by polynomials. While feed forward neural\nnetworks can fit any function, they cannot generalize out-of-distribution for\nhigher-order polynomials. Therefore, this paper collects and proposes\nmultiplicative neural network (MNN) architectures that are used as recursive\nbuilding blocks for approximating higher-order polynomials. Our experiments\nshow that MNNs are better than baseline models at generalizing, and their\nperformance in validation is true to their performance in out-of-distribution\ntests. In addition to MNN architectures, a simulation metamodeling approach is\nproposed for simulations with polynomial time step updates. For these\nsimulations, simulating a time interval can be performed in fewer steps by\nincreasing the step size, which entails approximating higher-order polynomials.\nWhile our approach is compatible with any simulation with polynomial time step\nupdates, a demonstration is shown for an epidemiology simulation model, which\nalso shows the inductive bias in MNNs for learning and generalizing\nhigher-order polynomials.\n","authors":["Jesper Hauch","Christoffer Riis","Francisco C. Pereira"],"pdf_url":"https://arxiv.org/pdf/2307.10892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10891v1","updated":"2023-07-20T14:10:40Z","published":"2023-07-20T14:10:40Z","title":"Syntactic vs Semantic Linear Abstraction and Refinement of Neural\n Networks","summary":" Abstraction is a key verification technique to improve scalability. However,\nits use for neural networks is so far extremely limited. Previous approaches\nfor abstracting classification networks replace several neurons with one of\nthem that is similar enough. We can classify the similarity as defined either\nsyntactically (using quantities on the connections between neurons) or\nsemantically (on the activation values of neurons for various inputs).\nUnfortunately, the previous approaches only achieve moderate reductions, when\nimplemented at all. In this work, we provide a more flexible framework where a\nneuron can be replaced with a linear combination of other neurons, improving\nthe reduction. We apply this approach both on syntactic and semantic\nabstractions, and implement and evaluate them experimentally. 
Further, we\nintroduce a refinement method for our abstractions, allowing for finding a\nbetter balance between reduction and precision.\n","authors":["Calvin Chau","Jan Křetínský","Stefanie Mohr"],"pdf_url":"https://arxiv.org/pdf/2307.10891v1.pdf","comment":"Accepted at ATVA 2023"},{"id":"http://arxiv.org/abs/2307.10890v1","updated":"2023-07-20T14:10:33Z","published":"2023-07-20T14:10:33Z","title":"Player-optimal Stable Regret for Bandit Learning in Matching Markets","summary":" The problem of matching markets has been studied for a long time in the\nliterature due to its wide range of applications. Finding a stable matching is\na common equilibrium objective in this problem. Since market participants are\nusually uncertain of their preferences, a rich line of recent works studies the\nonline setting where participants on one side (players) learn their unknown\npreferences from iterative interactions with the other side (arms). Most\nprevious works in this line are only able to derive theoretical guarantees for\nplayer-pessimal stable regret, which is defined relative to the players'\nleast-preferred stable matching. However, under the pessimal stable matching,\nplayers only obtain the least reward among all stable matchings. To maximize\nplayers' profits, the player-optimal stable matching would be the most\ndesirable. Though \\citet{basu21beyond} provide an upper bound for\nplayer-optimal stable regret, their result can be exponentially large if the\nplayers' preference gap is small. Whether a polynomial guarantee for this\nregret exists is a significant but still open problem. In this work, we provide\na new algorithm named explore-then-Gale-Shapley (ETGS) and show that the\noptimal stable regret of each player can be upper bounded by $O(K\\log\nT/\\Delta^2)$ where $K$ is the number of arms, $T$ is the horizon and $\\Delta$\nis the players' minimum preference gap among the first $N+1$-ranked arms. This\nresult significantly improves previous works which either have a weaker\nplayer-pessimal stable matching objective or apply only to markets with special\nassumptions. When the preferences of participants satisfy some special\nconditions, our regret upper bound also matches the previously derived lower\nbound.\n","authors":["Fang Kong","Shuai Li"],"pdf_url":"https://arxiv.org/pdf/2307.10890v1.pdf","comment":"SODA 2023"},{"id":"http://arxiv.org/abs/2307.02405v2","updated":"2023-07-20T14:10:24Z","published":"2023-07-05T16:27:33Z","title":"$ν^2$-Flows: Fast and improved neutrino reconstruction in\n multi-neutrino final states with conditional normalizing flows","summary":" In this work we introduce $\\nu^2$-Flows, an extension of the $\\nu$-Flows\nmethod to final states containing multiple neutrinos. The architecture can\nnatively scale for all combinations of object types and multiplicities in the\nfinal state for any desired neutrino multiplicities. In $t\\bar{t}$ dilepton\nevents, the momenta of both neutrinos and correlations between them are\nreconstructed more accurately than when using the most popular standard\nanalytical techniques, and solutions are found for all events. Inference time\nis significantly faster than competing methods, and can be reduced further by\nevaluating in parallel on graphics processing units. We apply $\\nu^2$-Flows to\n$t\\bar{t}$ dilepton events and show that the per-bin uncertainties in unfolded\ndistributions are much closer to the limit of performance set by perfect\nneutrino reconstruction than standard techniques. 
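For the explore-then-Gale-Shapley entry above (Kong and Li): once preferences have been estimated, the matching step is the classical player-proposing deferred-acceptance routine, sketched below on toy preference lists. The bandit exploration phase is omitted, and all identifiers are illustrative assumptions rather than the paper's code.

```python
# Illustrative sketch: player-proposing Gale-Shapley (deferred acceptance) on
# already-estimated preference lists; the exploration phase of ETGS that would
# produce these estimates is not shown.
def gale_shapley(player_prefs, arm_prefs):
    """player_prefs[p] and arm_prefs[a] are lists of indices, best first."""
    n_players = len(player_prefs)
    arm_rank = [{p: r for r, p in enumerate(prefs)} for prefs in arm_prefs]
    next_choice = [0] * n_players          # next arm each player will propose to
    match_of_arm = {}                      # arm -> player currently held
    free = list(range(n_players))
    while free:
        p = free.pop()
        a = player_prefs[p][next_choice[p]]
        next_choice[p] += 1
        if a not in match_of_arm:
            match_of_arm[a] = p            # arm was unmatched: tentatively accept
        elif arm_rank[a][p] < arm_rank[a][match_of_arm[a]]:
            free.append(match_of_arm[a])   # arm prefers the new proposer
            match_of_arm[a] = p
        else:
            free.append(p)                 # rejected; will propose to the next arm
    return {p: a for a, p in match_of_arm.items()}

players = [[0, 1, 2], [1, 0, 2], [0, 2, 1]]   # toy preference lists, best first
arms    = [[1, 0, 2], [0, 1, 2], [2, 1, 0]]
print(gale_shapley(players, arms))            # player-optimal stable matching
```

With proposals made by the player side, the resulting stable matching is player-optimal, which is the objective the regret bound in that entry is measured against.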
For the chosen double\ndifferential observables, $\\nu^2$-Flows results in improved statistical\nprecision for each bin by a factor of 1.5 to 2 in comparison to the Neutrino\nWeighting method and up to a factor of four in comparison to the Ellipse\napproach.\n","authors":["John Andrew Raine","Matthew Leigh","Knut Zoch","Tobias Golling"],"pdf_url":"https://arxiv.org/pdf/2307.02405v2.pdf","comment":"20 pages, 16 figures, 5 tables"},{"id":"http://arxiv.org/abs/2303.16716v2","updated":"2023-07-20T13:54:48Z","published":"2023-03-29T14:15:38Z","title":"Topological Point Cloud Clustering","summary":" We present Topological Point Cloud Clustering (TPCC), a new method to cluster\npoints in an arbitrary point cloud based on their contribution to global\ntopological features. TPCC synthesizes desirable features from spectral\nclustering and topological data analysis and is based on considering the\nspectral properties of a simplicial complex associated to the considered point\ncloud. As it is based on considering sparse eigenvector computations, TPCC is\nas easy to interpret and implement as spectral clustering. However, by\nfocusing not just on a single matrix associated to a graph created from the\npoint cloud data, but on a whole set of Hodge-Laplacians associated to an\nappropriately constructed simplicial complex, we can leverage a far richer set\nof topological features to characterize the data points within the point cloud\nand benefit from the relative robustness of topological techniques against\nnoise. We test the performance of TPCC on both synthetic and real-world data\nand compare it with classical spectral clustering.\n","authors":["Vincent P. Grande","Michael T. Schaub"],"pdf_url":"https://arxiv.org/pdf/2303.16716v2.pdf","comment":"Accepted at the 40th International Conference on Machine Learning\n (ICML), 2023. Code available at\n https://git.rwth-aachen.de/netsci/publication-2023-topological-point-cloud-clustering"},{"id":"http://arxiv.org/abs/2306.14030v2","updated":"2023-07-20T13:54:05Z","published":"2023-06-24T18:17:38Z","title":"My Boli: Code-mixed Marathi-English Corpora, Pretrained Language Models\n and Evaluation Benchmarks","summary":" The research on code-mixed data is limited due to the unavailability of\ndedicated code-mixed datasets and pre-trained language models. In this work, we\nfocus on the low-resource Indian language Marathi which lacks any prior work in\ncode-mixing. We present L3Cube-MeCorpus, a large code-mixed Marathi-English\n(Mr-En) corpus with 10 million social media sentences for pretraining. We also\nrelease L3Cube-MeBERT and MeRoBERTa, code-mixed BERT-based transformer models\npre-trained on MeCorpus. Furthermore, for benchmarking, we present three\nsupervised datasets MeHate, MeSent, and MeLID for downstream tasks like\ncode-mixed Mr-En hate speech detection, sentiment analysis, and language\nidentification, respectively. These evaluation datasets individually consist of\n~12,000 manually annotated Marathi-English code-mixed tweets. Ablations\nshow that the models trained on this novel corpus significantly outperform the\nexisting state-of-the-art BERT models. This is the first work that presents\nartifacts for code-mixed Marathi research. 
All datasets and models are publicly\nreleased at https://github.com/l3cube-pune/MarathiNLP .\n","authors":["Tanmay Chavan","Omkar Gokhale","Aditya Kane","Shantanu Patankar","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2306.14030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10875v1","updated":"2023-07-20T13:47:30Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" The popularity of point cloud deep models for safety-critical purposes has\nincreased, but the reliability and security of these models can be compromised\nby intentional or naturally occurring point cloud noise. To combat this issue,\nwe present a novel point cloud outlier removal method called PointCVaR, which\nempowers standard-trained models to eliminate additional outliers and restore\nthe data. Our approach begins by conducting attribution analysis to determine\nthe influence of each point on the model output, which we refer to as point\nrisk. We then optimize the process of filtering high-risk points using\nConditional Value at Risk (CVaR) as the objective. The rationale for this\napproach is based on the observation that noise points in point clouds tend to\ncluster in the tail of the risk distribution, with a low frequency but a high\nlevel of risk, resulting in significant interference with classification\nresults. Despite requiring no additional training effort, our method produces\nexceptional results in various removal-and-classification experiments for noisy\npoint clouds, which are corrupted by random noise, adversarial noise, and\nbackdoor trigger noise. Impressively, it achieves 87% accuracy in defense\nagainst the backdoor attack by removing triggers. Overall, the proposed\nPointCVaR effectively eliminates noise points and enhances point cloud\nclassification, making it a promising plug-in module for various models in\ndifferent scenarios.\n","authors":["Xinke Li","Junchi Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10870v1","updated":"2023-07-20T13:42:13Z","published":"2023-07-20T13:42:13Z","title":"Nonlinear Meta-Learning Can Guarantee Faster Rates","summary":" Many recent theoretical works on \\emph{meta-learning} aim to achieve\nguarantees in leveraging similar representational structures from related tasks\ntowards simplifying a target task. Importantly, the main aim in theory works on\nthe subject is to understand the extent to which convergence rates -- in\nlearning a common representation -- \\emph{may scale with the number $N$ of\ntasks} (as well as the number of samples per task). First steps in this setting\ndemonstrate this property when both the shared representation amongst tasks,\nand task-specific regression functions, are linear. This linear setting readily\nreveals the benefits of aggregating tasks, e.g., via averaging arguments. In\npractice, however, the representation is often highly nonlinear, introducing\nnontrivial biases in each task that cannot easily be averaged out as in the\nlinear case. In the present work, we derive theoretical guarantees for\nmeta-learning with nonlinear representations. 
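On the PointCVaR entry above (Li and Lu): its filtering idea can be caricatured as dropping the points whose risk falls in the upper alpha-tail of the risk distribution, the region over which CVaR averages. The sketch below uses synthetic risk scores and a simple quantile threshold as a stand-in for the paper's attribution analysis and CVaR-driven optimization; every name in it is an assumption.

```python
# Illustrative sketch only: remove points whose risk lies in the upper alpha-tail
# (the region CVaR averages over). Synthetic risk scores replace the
# attribution-based point risk used by PointCVaR.
import numpy as np

def filter_high_risk_points(points: np.ndarray, risk: np.ndarray, alpha: float = 0.05):
    """Keep points whose risk is below the (1 - alpha) quantile (the VaR level)."""
    var_threshold = np.quantile(risk, 1.0 - alpha)       # Value at Risk
    tail = risk >= var_threshold
    cvar = risk[tail].mean()                              # Conditional Value at Risk
    return points[~tail], cvar

rng = np.random.default_rng(0)
cloud = rng.normal(size=(1024, 3))                        # toy point cloud
risk = rng.gamma(shape=1.0, scale=0.1, size=1024)         # synthetic per-point risk
risk[:20] += 2.0                                          # pretend 20 points are noise/triggers
clean, cvar = filter_high_risk_points(cloud, risk, alpha=0.05)
print(clean.shape, round(float(cvar), 3))                 # roughly 5% of points removed
```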
In particular, assuming the\nshared nonlinearity maps to an infinite-dimensional RKHS, we show that\nadditional biases can be mitigated with careful regularization that leverages\nthe smoothness of task-specific regression functions,\n","authors":["Dimitri Meunier","Zhu Li","Arthur Gretton","Samory Kpotufe"],"pdf_url":"https://arxiv.org/pdf/2307.10870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10869v1","updated":"2023-07-20T13:41:26Z","published":"2023-07-20T13:41:26Z","title":"Performance Issue Identification in Cloud Systems with\n Relational-Temporal Anomaly Detection","summary":" Performance issues permeate large-scale cloud service systems, which can lead\nto huge revenue losses. To ensure reliable performance, it's essential to\naccurately identify and localize these issues using service monitoring metrics.\nGiven the complexity and scale of modern cloud systems, this task can be\nchallenging and may require extensive expertise and resources beyond the\ncapacity of individual humans. Some existing methods tackle this problem by\nanalyzing each metric independently to detect anomalies. However, this could\nincur overwhelming alert storms that are difficult for engineers to diagnose\nmanually. To pursue better performance, not only the temporal patterns of\nmetrics but also the correlation between metrics (i.e., relational patterns)\nshould be considered, which can be formulated as a multivariate metrics anomaly\ndetection problem. However, most of the studies fall short of extracting these\ntwo types of features explicitly. Moreover, there exist some unlabeled\nanomalies mixed in the training data, which may hinder the detection\nperformance. To address these limitations, we propose the Relational- Temporal\nAnomaly Detection Model (RTAnomaly) that combines the relational and temporal\ninformation of metrics. RTAnomaly employs a graph attention layer to learn the\ndependencies among metrics, which will further help pinpoint the anomalous\nmetrics that may cause the anomaly effectively. In addition, we exploit the\nconcept of positive unlabeled learning to address the issue of potential\nanomalies in the training data. To evaluate our method, we conduct experiments\non a public dataset and two industrial datasets. RTAnomaly outperforms all the\nbaseline models by achieving an average F1 score of 0.929 and Hit@3 of 0.920,\ndemonstrating its superiority.\n","authors":["Wenwei Gu","Jinyang Liu","Zhuangbin Chen","Jianping Zhang","Yuxin Su","Jiazhen Gu","Cong Feng","Zengyin Yang","Michael Lyu"],"pdf_url":"https://arxiv.org/pdf/2307.10869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10867v1","updated":"2023-07-20T13:40:22Z","published":"2023-07-20T13:40:22Z","title":"FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with\n Human Feedback","summary":" Captions are crucial for understanding scientific visualizations and\ndocuments. Existing captioning methods for scientific figures rely on\nfigure-caption pairs extracted from documents for training, many of which fall\nshort with respect to metrics like helpfulness, explainability, and\nvisual-descriptiveness [15] leading to generated captions being misaligned with\nreader preferences. To enable the generation of high-quality figure captions,\nwe introduce FigCaps-HF a new framework for figure-caption generation that can\nincorporate domain expert feedback in generating captions optimized for reader\npreferences. 
Our framework comprises 1) an automatic method for evaluating the\nquality of figure-caption pairs, 2) a novel reinforcement learning with human\nfeedback (RLHF) method to optimize a generative figure-to-caption model for\nreader preferences. We demonstrate the effectiveness of our simple learning\nframework by improving performance over standard fine-tuning across different\ntypes of models. In particular, when using BLIP as the base model, our RLHF\nframework achieves a mean gain of 35.7%, 16.9%, and 9% in ROUGE, BLEU, and\nMeteor, respectively. Finally, we release a large-scale benchmark dataset with\nhuman feedback on figure-caption pairs to enable further evaluation and\ndevelopment of RLHF techniques for this problem.\n","authors":["Ashish Singh","Prateek Agarwal","Zixuan Huang","Arpita Singh","Tong Yu","Sungchul Kim","Victor Bursztyn","Nikos Vlassis","Ryan A. Rossi"],"pdf_url":"https://arxiv.org/pdf/2307.10867v1.pdf","comment":"19 pages, 4 figures. Benchmark Documentation:\n https://figcapshf.github.io/"},{"id":"http://arxiv.org/abs/2307.10865v1","updated":"2023-07-20T13:34:11Z","published":"2023-07-20T13:34:11Z","title":"Addressing caveats of neural persistence with deep graph persistence","summary":" Neural Persistence is a prominent measure for quantifying neural network\ncomplexity, proposed in the emerging field of topological data analysis in deep\nlearning. In this work, however, we find both theoretically and empirically\nthat the variance of network weights and spatial concentration of large weights\nare the main factors that impact neural persistence. Whilst this captures\nuseful information for linear classifiers, we find that no relevant spatial\nstructure is present in later layers of deep neural networks, making neural\npersistence roughly equivalent to the variance of weights. Additionally, the\nproposed averaging procedure across layers for deep neural networks does not\nconsider interaction between layers. Based on our analysis, we propose an\nextension of the filtration underlying neural persistence to the whole neural\nnetwork instead of single layers, which is equivalent to calculating neural\npersistence on one particular matrix. This yields our deep graph persistence\nmeasure, which implicitly incorporates persistent paths through the network and\nalleviates variance-related issues through standardisation. Code is available\nat https://github.com/ExplainableML/Deep-Graph-Persistence .\n","authors":["Leander Girrbach","Anders Christensen","Ole Winther","Zeynep Akata","A. Sophia Koepke"],"pdf_url":"https://arxiv.org/pdf/2307.10865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10864v1","updated":"2023-07-20T13:33:28Z","published":"2023-07-20T13:33:28Z","title":"Divide & Bind Your Attention for Improved Generative Semantic Nursing","summary":" Emerging large-scale text-to-image generative models, e.g., Stable Diffusion\n(SD), have exhibited overwhelming results with high fidelity. Despite the\nmagnificent progress, current state-of-the-art models still struggle to\ngenerate images fully adhering to the input prompt. Prior work, Attend &\nExcite, has introduced the concept of Generative Semantic Nursing (GSN), aiming\nto optimize cross-attention during inference time to better incorporate the\nsemantics. It demonstrates promising results in generating simple prompts,\ne.g., ``a cat and a dog''. However, its efficacy declines when dealing with\nmore complex prompts, and it does not explicitly address the problem of\nimproper attribute binding. 
To address the challenges posed by complex prompts\nor scenarios involving multiple entities and to achieve improved attribute\nbinding, we propose Divide & Bind. We introduce two novel loss objectives for\nGSN: a novel attendance loss and a binding loss. Our approach stands out in its\nability to faithfully synthesize desired objects with improved attribute\nalignment from complex prompts and exhibits superior performance across\nmultiple evaluation benchmarks. More videos and updates can be found on the\nproject page \\url{https://sites.google.com/view/divide-and-bind}.\n","authors":["Yumeng Li","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2307.10864v1.pdf","comment":"Project page: \\url{https://sites.google.com/view/divide-and-bind}"},{"id":"http://arxiv.org/abs/2307.09206v2","updated":"2023-07-20T13:29:27Z","published":"2023-07-18T12:42:59Z","title":"Context-Conditional Navigation with a Learning-Based Terrain- and\n Robot-Aware Dynamics Model","summary":" In autonomous navigation settings, several quantities can be subject to\nvariations. Terrain properties such as friction coefficients may vary over time\ndepending on the location of the robot. Also, the dynamics of the robot may\nchange due to, e.g., different payloads, changing the system's mass, or wear\nand tear, changing actuator gains or joint friction. An autonomous agent should\nthus be able to adapt to such variations. In this paper, we develop a novel\nprobabilistic, terrain- and robot-aware forward dynamics model, termed TRADYN,\nwhich is able to adapt to the above-mentioned variations. It builds on recent\nadvances in meta-learning forward dynamics models based on Neural Processes. We\nevaluate our method in a simulated 2D navigation setting with a unicycle-like\nrobot and different terrain layouts with spatially varying friction\ncoefficients. In our experiments, the proposed model exhibits lower prediction\nerror for the task of long-horizon trajectory prediction, compared to\nnon-adaptive ablation models. We also evaluate our model on the downstream task\nof navigation planning, which demonstrates improved performance in planning\ncontrol-efficient paths by taking robot and terrain properties into account.\n","authors":["Suresh Guttikonda","Jan Achterhold","Haolong Li","Joschka Boedecker","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2307.09206v2.pdf","comment":"\\copyright 2023 IEEE. Accepted for publication in European Conference\n on Mobile Robots (ECMR), 2023. Updated copyright statement"},{"id":"http://arxiv.org/abs/2211.04974v2","updated":"2023-07-20T13:11:13Z","published":"2022-11-09T15:39:32Z","title":"Leveraging Offline Data in Online Reinforcement Learning","summary":" Two central paradigms have emerged in the reinforcement learning (RL)\ncommunity: online RL and offline RL. In the online RL setting, the agent has no\nprior knowledge of the environment, and must interact with it in order to find\nan $\\epsilon$-optimal policy. In the offline RL setting, the learner instead\nhas access to a fixed dataset to learn from, but is unable to otherwise\ninteract with the environment, and must obtain the best policy it can from this\noffline data. 
Practical scenarios often motivate an intermediate setting: if we\nhave some set of offline data and, in addition, may also interact with the\nenvironment, how can we best use the offline data to minimize the number of\nonline interactions necessary to learn an $\\epsilon$-optimal policy?\n In this work, we consider this setting, which we call the \\textsf{FineTuneRL}\nsetting, for MDPs with linear structure. We characterize the necessary number\nof online samples needed in this setting given access to some offline dataset,\nand develop an algorithm, \\textsc{FTPedel}, which is provably optimal, up to\n$H$ factors. We show through an explicit example that combining offline data\nwith online interactions can lead to a provable improvement over either purely\noffline or purely online RL. Finally, our results illustrate the distinction\nbetween \\emph{verifiable} learning, the typical setting considered in online\nRL, and \\emph{unverifiable} learning, the setting often considered in offline\nRL, and show that there is a formal separation between these regimes.\n","authors":["Andrew Wagenmaker","Aldo Pacchiano"],"pdf_url":"https://arxiv.org/pdf/2211.04974v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10845v1","updated":"2023-07-20T13:07:41Z","published":"2023-07-20T13:07:41Z","title":"Self-paced Weight Consolidation for Continual Learning","summary":" Continual learning algorithms which keep the parameters of new tasks close to\nthat of previous tasks, are popular in preventing catastrophic forgetting in\nsequential task learning settings. However, 1) the performance for the new\ncontinual learner will be degraded without distinguishing the contributions of\npreviously learned tasks; 2) the computational cost will be greatly increased\nwith the number of tasks, since most existing algorithms need to regularize all\nprevious tasks when learning new tasks. To address the above challenges, we\npropose a self-paced Weight Consolidation (spWC) framework to attain robust\ncontinual learning via evaluating the discriminative contributions of previous\ntasks. To be specific, we develop a self-paced regularization to reflect the\npriorities of past tasks via measuring difficulty based on key performance\nindicator (i.e., accuracy). When encountering a new task, all previous tasks\nare sorted from \"difficult\" to \"easy\" based on the priorities. Then the\nparameters of the new continual learner will be learned via selectively\nmaintaining the knowledge amongst more difficult past tasks, which could well\novercome catastrophic forgetting with less computational cost. We adopt an\nalternative convex search to iteratively update the model parameters and\npriority weights in the bi-convex formulation. The proposed spWC framework is\nplug-and-play, which is applicable to most continual learning algorithms (e.g.,\nEWC, MAS and RCIL) in different directions (e.g., classification and\nsegmentation). 
Experimental results on several public benchmark datasets\ndemonstrate that our proposed framework can effectively improve performance\nwhen compared with other popular continual learning algorithms.\n","authors":["Wei Cong","Yang Cong","Gan Sun","Yuyang Liu","Jiahua Dong"],"pdf_url":"https://arxiv.org/pdf/2307.10845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10843v1","updated":"2023-07-20T13:04:26Z","published":"2023-07-20T13:04:26Z","title":"Global Precipitation Nowcasting of Integrated Multi-satellitE Retrievals\n for GPM: A U-Net Convolutional LSTM Architecture","summary":" This paper presents a deep learning architecture for nowcasting of\nprecipitation almost globally every 30 min with a 4-hour lead time. The\narchitecture fuses a U-Net and a convolutional long short-term memory (LSTM)\nneural network and is trained using data from the Integrated MultisatellitE\nRetrievals for GPM (IMERG) and a few key precipitation drivers from the Global\nForecast System (GFS). The impacts of different training loss functions,\nincluding the mean-squared error (regression) and the focal-loss\n(classification), on the quality of precipitation nowcasts are studied. The\nresults indicate that the regression network performs well in capturing light\nprecipitation (below 1.6 mm/hr), but the classification network can outperform\nthe regression network for nowcasting of precipitation extremes (>8 mm/hr), in\nterms of the critical success index (CSI). Using the Wasserstein distance, it\nis shown that the predicted precipitation by the classification network has a\ncloser class probability distribution to the IMERG than the regression network.\nIt is uncovered that the inclusion of the physical variables can improve\nprecipitation nowcasting, especially at longer lead times in both networks.\nTaking IMERG as a relative reference, a multi-scale analysis in terms of\nfractions skill score (FSS) shows that the nowcasting machine remains skillful\n(FSS > 0.5) at the resolution of 10 km compared to 50 km for GFS. For\nprecipitation rates greater than 4~mm/hr, only the classification network\nremains FSS-skillful on scales greater than 50 km within a 2-hour lead time.\n","authors":["Reyhaneh Rahimi","Ardeshir Ebtehaj","Ali Behrangi","Jackson Tan"],"pdf_url":"https://arxiv.org/pdf/2307.10843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10842v1","updated":"2023-07-20T13:02:45Z","published":"2023-07-20T13:02:45Z","title":"Label Calibration for Semantic Segmentation Under Domain Shift","summary":" Performance of a pre-trained semantic segmentation model is likely to\nsubstantially decrease on data from a new domain. We show a pre-trained model\ncan be adapted to unlabelled target domain data by calculating soft-label\nprototypes under the domain shift and making predictions according to the\nprototype closest to the vector with predicted class probabilities. The\nproposed adaptation procedure is fast, comes almost for free in terms of\ncomputational resources and leads to considerable performance improvements. 
We\ndemonstrate the benefits of such label calibration on the highly-practical\nsynthetic-to-real semantic segmentation problem.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10842v1.pdf","comment":"ICLR 2023 Workshop on Pitfalls of Limited Data and Computation for\n Trustworthy ML"},{"id":"http://arxiv.org/abs/2207.02575v2","updated":"2023-07-20T12:59:44Z","published":"2022-07-06T10:42:57Z","title":"Instance-Dependent Near-Optimal Policy Identification in Linear MDPs via\n Online Experiment Design","summary":" While much progress has been made in understanding the minimax sample\ncomplexity of reinforcement learning (RL) -- the complexity of learning on the\n\"worst-case\" instance -- such measures of complexity often do not capture the\ntrue difficulty of learning. In practice, on an \"easy\" instance, we might hope\nto achieve a complexity far better than that achievable on the worst-case\ninstance. In this work we seek to understand the \"instance-dependent\"\ncomplexity of learning near-optimal policies (PAC RL) in the setting of RL with\nlinear function approximation. We propose an algorithm, \\textsc{Pedel}, which\nachieves a fine-grained instance-dependent measure of complexity, the first of\nits kind in the RL with function approximation setting, thereby capturing the\ndifficulty of learning on each particular problem instance. Through an explicit\nexample, we show that \\textsc{Pedel} yields provable gains over low-regret,\nminimax-optimal algorithms and that such algorithms are unable to hit the\ninstance-optimal rate. Our approach relies on a novel online experiment\ndesign-based procedure which focuses the exploration budget on the \"directions\"\nmost relevant to learning a near-optimal policy, and may be of independent\ninterest.\n","authors":["Andrew Wagenmaker","Kevin Jamieson"],"pdf_url":"https://arxiv.org/pdf/2207.02575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06092v2","updated":"2023-07-20T12:54:32Z","published":"2023-07-12T11:35:37Z","title":"Quantitative CLTs in Deep Neural Networks","summary":" We study the distribution of a fully connected neural network with random\nGaussian weights and biases in which the hidden layer widths are proportional\nto a large constant $n$. Under mild assumptions on the non-linearity, we obtain\nquantitative bounds on normal approximations valid at large but finite $n$ and\nany fixed network depth. Our theorems show both for the finite-dimensional\ndistributions and the entire process, that the distance between a random fully\nconnected network (and its derivatives) to the corresponding infinite width\nGaussian process scales like $n^{-\\gamma}$ for $\\gamma>0$, with the exponent\ndepending on the metric used to measure discrepancy. Our bounds are strictly\nstronger in terms of their dependence on network width than any previously\navailable in the literature; in the one-dimensional case, we also prove that\nthey are optimal, i.e., we establish matching lower bounds.\n","authors":["Stefano Favaro","Boris Hanin","Domenico Marinucci","Ivan Nourdin","Giovanni Peccati"],"pdf_url":"https://arxiv.org/pdf/2307.06092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10810v1","updated":"2023-07-20T12:20:18Z","published":"2023-07-20T12:20:18Z","title":"On Combining Expert Demonstrations in Imitation Learning via Optimal\n Transport","summary":" Imitation learning (IL) seeks to teach agents specific tasks through expert\ndemonstrations. 
One of the key approaches to IL is to define a distance between\nagent and expert and to find an agent policy that minimizes that distance.\nOptimal transport methods have been widely used in imitation learning as they\nprovide ways to measure meaningful distances between agent and expert\ntrajectories. However, the problem of how to optimally combine multiple expert\ndemonstrations has not been widely studied. The standard method is to simply\nconcatenate state (-action) trajectories, which is problematic when\ntrajectories are multi-modal. We propose an alternative method that uses a\nmulti-marginal optimal transport distance and enables the combination of\nmultiple and diverse state-trajectories in the OT sense, providing a more\nsensible geometric average of the demonstrations. Our approach enables an agent\nto learn from several experts, and its efficiency is analyzed on OpenAI Gym\ncontrol environments and demonstrates that the standard method is not always\noptimal.\n","authors":["Ilana Sebag","Samuel Cohen","Marc Peter Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2307.10810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15382v2","updated":"2023-07-20T12:18:49Z","published":"2022-11-24T13:21:36Z","title":"Neural Network Complexity of Chaos and Turbulence","summary":" Chaos and turbulence are complex physical phenomena, yet a precise definition\nof the complexity measure that quantifies them is still lacking. In this work\nwe consider the relative complexity of chaos and turbulence from the\nperspective of deep neural networks. We analyze a set of classification\nproblems, where the network has to distinguish images of fluid profiles in the\nturbulent regime from other classes of images such as fluid profiles in the\nchaotic regime, various constructions of noise and real world images. We\nanalyze incompressible as well as weakly compressible fluid flows. We quantify\nthe complexity of the computation performed by the network via the intrinsic\ndimensionality of the internal feature representations, and calculate the\neffective number of independent features which the network uses in order to\ndistinguish between classes. In addition to providing a numerical estimate of\nthe complexity of the computation, the measure also characterizes the neural\nnetwork processing at intermediate and final stages. We construct adversarial\nexamples and use them to identify the two point correlation spectra for the\nchaotic and turbulent vorticity as the feature used by the network for\nclassification.\n","authors":["Tim Whittaker","Romuald A. Janik","Yaron Oz"],"pdf_url":"https://arxiv.org/pdf/2211.15382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.01001v3","updated":"2023-07-20T12:17:50Z","published":"2021-06-02T07:53:54Z","title":"Warming up recurrent neural networks to maximise reachable\n multistability greatly improves learning","summary":" Training recurrent neural networks is known to be difficult when time\ndependencies become long. In this work, we show that most standard cells only\nhave one stable equilibrium at initialisation, and that learning on tasks with\nlong time dependencies generally occurs once the number of network stable\nequilibria increases; a property known as multistability. Multistability is\noften not easily attained by initially monostable networks, making learning of\nlong time dependencies between inputs and outputs difficult. 
This insight leads\nto the design of a novel way to initialise any recurrent cell connectivity\nthrough a procedure called \"warmup\" to improve its capability to learn\narbitrarily long time dependencies. This initialisation procedure is designed\nto maximise network reachable multistability, i.e., the number of equilibria\nwithin the network that can be reached through relevant input trajectories, in\nfew gradient steps. We show on several information restitution, sequence\nclassification, and reinforcement learning benchmarks that warming up greatly\nimproves learning speed and performance, for multiple recurrent cells, but\nsometimes impedes precision. We therefore introduce a double-layer architecture\ninitialised with a partial warmup that is shown to greatly improve learning of\nlong time dependencies while maintaining high levels of precision. This\napproach provides a general framework for improving learning abilities of any\nrecurrent cell when long time dependencies are present. We also show\nempirically that other initialisation and pretraining procedures from the\nliterature implicitly foster reachable multistability of recurrent cells.\n","authors":["Gaspard Lambrechts","Florent De Geeter","Nicolas Vecoven","Damien Ernst","Guillaume Drion"],"pdf_url":"https://arxiv.org/pdf/2106.01001v3.pdf","comment":"20 pages, 35 pages total, 38 figures"},{"id":"http://arxiv.org/abs/2307.10805v1","updated":"2023-07-20T12:16:26Z","published":"2023-07-20T12:16:26Z","title":"Communication-Efficient Split Learning via Adaptive Feature-Wise\n Compression","summary":" This paper proposes a novel communication-efficient split learning (SL)\nframework, named SplitFC, which reduces the communication overhead required for\ntransmitting intermediate feature and gradient vectors during the SL training\nprocess. The key idea of SplitFC is to leverage different dispersion degrees\nexhibited in the columns of the matrices. SplitFC incorporates two compression\nstrategies: (i) adaptive feature-wise dropout and (ii) adaptive feature-wise\nquantization. In the first strategy, the intermediate feature vectors are\ndropped with adaptive dropout probabilities determined based on the standard\ndeviation of these vectors. Then, by the chain rule, the intermediate gradient\nvectors associated with the dropped feature vectors are also dropped. In the\nsecond strategy, the non-dropped intermediate feature and gradient vectors are\nquantized using adaptive quantization levels determined based on the ranges of\nthe vectors. To minimize the quantization error, the optimal quantization\nlevels of this strategy are derived in a closed-form expression. Simulation\nresults on the MNIST, CIFAR-10, and CelebA datasets demonstrate that SplitFC\nprovides more than a 5.6% increase in classification accuracy compared to\nstate-of-the-art SL frameworks, while they require 320 times less communication\noverhead compared to the vanilla SL framework without compression.\n","authors":["Yongjeong Oh","Jaeho Lee","Christopher G. 
Brinton","Yo-Seb Jeon"],"pdf_url":"https://arxiv.org/pdf/2307.10805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10803v1","updated":"2023-07-20T12:12:05Z","published":"2023-07-20T12:12:05Z","title":"Spatial-Temporal Data Mining for Ocean Science: Data, Methodologies, and\n Opportunities","summary":" With the increasing amount of spatial-temporal~(ST) ocean data, numerous\nspatial-temporal data mining (STDM) studies have been conducted to address\nvarious oceanic issues, e.g., climate forecasting and disaster warning.\nCompared with typical ST data (e.g., traffic data), ST ocean data is more\ncomplicated with some unique characteristics, e.g., diverse regionality and\nhigh sparsity. These characteristics make it difficult to design and train STDM\nmodels. Unfortunately, an overview of these studies is still missing, hindering\ncomputer scientists to identify the research issues in ocean while discouraging\nresearchers in ocean science from applying advanced STDM techniques. To remedy\nthis situation, we provide a comprehensive survey to summarize existing STDM\nstudies in ocean. Concretely, we first summarize the widely-used ST ocean\ndatasets and identify their unique characteristics. Then, typical ST ocean data\nquality enhancement techniques are discussed. Next, we classify existing STDM\nstudies for ocean into four types of tasks, i.e., prediction, event detection,\npattern mining, and anomaly detection, and elaborate the techniques for these\ntasks. Finally, promising research opportunities are highlighted. This survey\nwill help scientists from the fields of both computer science and ocean science\nhave a better understanding of the fundamental concepts, key techniques, and\nopen challenges of STDM in ocean.\n","authors":["Hanchen Yang","Wengen Li","Shuyu Wang","Hui Li","Jihong Guan","Shuigeng Zhou","Jiannong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.10803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.03259v2","updated":"2023-07-20T12:11:56Z","published":"2020-09-07T17:27:27Z","title":"Implicit Multidimensional Projection of Local Subspaces","summary":" We propose a visualization method to understand the effect of\nmultidimensional projection on local subspaces, using implicit function\ndifferentiation. Here, we understand the local subspace as the multidimensional\nlocal neighborhood of data points. Existing methods focus on the projection of\nmultidimensional data points, and the neighborhood information is ignored. Our\nmethod is able to analyze the shape and directional information of the local\nsubspace to gain more insights into the global structure of the data through\nthe perception of local structures. Local subspaces are fitted by\nmultidimensional ellipses that are spanned by basis vectors. An accurate and\nefficient vector transformation method is proposed based on analytical\ndifferentiation of multidimensional projections formulated as implicit\nfunctions. The results are visualized as glyphs and analyzed using a full set\nof specifically-designed interactions supported in our efficient web-based\nvisualization tool. The usefulness of our method is demonstrated using various\nmulti- and high-dimensional benchmark datasets. 
Our implicit differentiation\nvector transformation is evaluated through numerical comparisons; the overall\nmethod is evaluated through exploration examples and use cases.\n","authors":["Rongzheng Bian","Yumeng Xue","Liang Zhou","Jian Zhang","Baoquan Chen","Daniel Weiskopf","Yunhai Wang"],"pdf_url":"https://arxiv.org/pdf/2009.03259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2205.12900v4","updated":"2023-07-20T12:10:09Z","published":"2022-05-25T16:46:01Z","title":"Pre-trained Perceptual Features Improve Differentially Private Image\n Generation","summary":" Training even moderately-sized generative models with differentially-private\nstochastic gradient descent (DP-SGD) is difficult: the required level of noise\nfor reasonable levels of privacy is simply too large. We advocate instead\nbuilding off a good, relevant representation on an informative public dataset,\nthen learning to model the private data with that representation. In\nparticular, we minimize the maximum mean discrepancy (MMD) between private\ntarget data and a generator's distribution, using a kernel based on perceptual\nfeatures learned from a public dataset. With the MMD, we can simply privatize\nthe data-dependent term once and for all, rather than introducing noise at each\nstep of optimization as in DP-SGD. Our algorithm allows us to generate\nCIFAR10-level images with $\\epsilon \\approx 2$ which capture distinctive\nfeatures in the distribution, far surpassing the current state of the art,\nwhich mostly focuses on datasets such as MNIST and FashionMNIST at a large\n$\\epsilon \\approx 10$. 
Our work introduces simple yet powerful foundations for\nreducing the gap between private and non-private deep generative models. Our\ncode is available at \url{https://github.com/ParkLabML/DP-MEPF}.\n","authors":["Fredrik Harder","Milad Jalali Asadabadi","Danica J. Sutherland","Mijung Park"],"pdf_url":"https://arxiv.org/pdf/2205.12900v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.03455v2","updated":"2023-07-20T11:54:22Z","published":"2021-07-07T19:35:31Z","title":"Model Selection for Generic Contextual Bandits","summary":" We consider the problem of model selection for the general stochastic\ncontextual bandits under the realizability assumption. We propose a successive\nrefinement based algorithm called Adaptive Contextual Bandit ({\ttfamily ACB}),\nwhich works in phases and successively eliminates model classes that are too\nsimple to fit the given instance. We prove that this algorithm is adaptive,\ni.e., the regret rate order-wise matches that of any provable contextual bandit\nalgorithm (ex. \cite{falcon}), which needs the knowledge of the true model\nclass. The price of not knowing the correct model class turns out to be only an\nadditive term contributing to the second order term in the regret bound. This\ncost possesses the intuitive property that it becomes smaller as the model class\nbecomes easier to identify, and vice-versa. We also show that a much simpler\nexplore-then-commit (ETC) style algorithm also obtains a similar regret bound,\ndespite not knowing the true model class. However, the cost of model selection\nis higher in ETC as opposed to in {\ttfamily ACB}, as expected. Furthermore,\nfor the special case of linear contextual bandits, we propose specialized\nalgorithms that obtain sharper guarantees compared to the generic setup.\n","authors":["Avishek Ghosh","Abishek Sankararaman","Kannan Ramchandran"],"pdf_url":"https://arxiv.org/pdf/2107.03455v2.pdf","comment":"Accepted at IEEE Transactions on Information Theory. arXiv admin\n note: text overlap with arXiv:2006.02612"},{"id":"http://arxiv.org/abs/2307.10792v1","updated":"2023-07-20T11:45:38Z","published":"2023-07-20T11:45:38Z","title":"Optimizing PatchCore for Few/many-shot Anomaly Detection","summary":" Few-shot anomaly detection (AD) is an emerging sub-field of general AD, and\ntries to distinguish between normal and anomalous data using only few selected\nsamples. While newly proposed few-shot AD methods do compare against\npre-existing algorithms developed for the full-shot domain as baselines, they\ndo not dedicatedly optimize them for the few-shot setting. It thus remains\nunclear if the performance of such pre-existing algorithms can be further\nimproved. We address said question in this work. Specifically, we present a\nstudy on the AD/anomaly segmentation (AS) performance of PatchCore, the current\nstate-of-the-art full-shot AD/AS algorithm, in both the few-shot and the\nmany-shot settings. We hypothesize that further performance improvements can be\nrealized by (I) optimizing its various hyperparameters, and by (II)\ntransferring techniques known to improve few-shot supervised learning to the AD\ndomain. 
Exhaustive experiments on the public VisA and MVTec AD datasets reveal\nthat (I) significant performance improvements can be realized by optimizing\nhyperparameters such as the underlying feature extractor, and that (II)\nimage-level augmentations can, but are not guaranteed to, improve performance.\nBased on these findings, we achieve a new state of the art in few-shot AD on\nVisA, further demonstrating the merit of adapting pre-existing AD/AS methods to\nthe few-shot setting. Last, we identify the investigation of feature extractors\nwith a strong inductive bias as a potential future research direction for\n(few-shot) AD/AS.\n","authors":["João Santos","Triet Tran","Oliver Rippel"],"pdf_url":"https://arxiv.org/pdf/2307.10792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10788v1","updated":"2023-07-20T11:38:55Z","published":"2023-07-20T11:38:55Z","title":"Adversarial attacks for mixtures of classifiers","summary":" Mixtures of classifiers (a.k.a. randomized ensembles) have been proposed as a\nway to improve robustness against adversarial attacks. However, it has been\nshown that existing attacks are not well suited for this kind of classifier.\nIn this paper, we discuss the problem of attacking a mixture in a principled\nway and introduce two desirable properties of attacks based on a geometrical\nanalysis of the problem (effectiveness and maximality). We then show that\nexisting attacks do not meet both of these properties. Finally, we introduce a\nnew attack called lattice climber attack with theoretical guarantees on the\nbinary linear setting, and we demonstrate its performance by conducting\nexperiments on synthetic and real datasets.\n","authors":["Lucas Gnecco Heredia","Benjamin Negrevergne","Yann Chevaleyre"],"pdf_url":"https://arxiv.org/pdf/2307.10788v1.pdf","comment":"7 pages + 4 pages of appendix. 5 figures in main text"},{"id":"http://arxiv.org/abs/2307.09614v2","updated":"2023-07-20T11:36:52Z","published":"2023-07-13T19:03:06Z","title":"Multi-view self-supervised learning for multivariate variable-channel\n time series","summary":" Labeling of multivariate biomedical time series data is a laborious and\nexpensive process. Self-supervised contrastive learning alleviates the need for\nlarge, labeled datasets through pretraining on unlabeled data. However, for\nmultivariate time series data, the set of input channels often varies between\napplications, and most existing work does not allow for transfer between\ndatasets with different sets of input channels. We propose learning one encoder\nto operate on all input channels individually. We then use a message passing\nneural network to extract a single representation across channels. We\ndemonstrate the potential of this method by pretraining our model on a dataset\nwith six EEG channels and then fine-tuning it on a dataset with two different\nEEG channels. We compare models with and without the message passing neural\nnetwork across different contrastive loss functions. We show that our method,\ncombined with the TS2Vec loss, outperforms all other methods in most settings.\n","authors":["Thea Brüsch","Mikkel N. Schmidt","Tommy S. 
Alstrøm"],"pdf_url":"https://arxiv.org/pdf/2307.09614v2.pdf","comment":"To appear in proceedings of 2023 IEEE International workshop on\n Machine Learning for Signal Processing"},{"id":"http://arxiv.org/abs/2307.10787v1","updated":"2023-07-20T11:36:45Z","published":"2023-07-20T11:36:45Z","title":"Feed-Forward Source-Free Domain Adaptation via Class Prototypes","summary":" Source-free domain adaptation has become popular because of its practical\nusefulness and no need to access source data. However, the adaptation process\nstill takes a considerable amount of time and is predominantly based on\noptimization that relies on back-propagation. In this work we present a simple\nfeed-forward approach that challenges the need for back-propagation based\nadaptation. Our approach is based on computing prototypes of classes under the\ndomain shift using a pre-trained model. It achieves strong improvements in\naccuracy compared to the pre-trained model and requires only a small fraction\nof time of existing domain adaptation methods.\n","authors":["Ondrej Bohdal","Da Li","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2307.10787v1.pdf","comment":"ECCV 2022 Workshop on Out of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2307.10779v1","updated":"2023-07-20T11:29:17Z","published":"2023-07-20T11:29:17Z","title":"Efficient Beam Tree Recursion","summary":" Beam Tree Recursive Neural Network (BT-RvNN) was recently proposed as a\nsimple extension of Gumbel Tree RvNN and it was shown to achieve\nstate-of-the-art length generalization performance in ListOps while maintaining\ncomparable performance on other tasks. However, although not the worst in its\nkind, BT-RvNN can be still exorbitantly expensive in memory usage. In this\npaper, we identify the main bottleneck in BT-RvNN's memory usage to be the\nentanglement of the scorer function and the recursive cell function. We propose\nstrategies to remove this bottleneck and further simplify its memory usage.\nOverall, our strategies not only reduce the memory usage of BT-RvNN by\n$10$-$16$ times but also create a new state-of-the-art in ListOps while\nmaintaining similar performance in other tasks. In addition, we also propose a\nstrategy to utilize the induced latent-tree node representations produced by\nBT-RvNN to turn BT-RvNN from a sentence encoder of the form $f:\\mathbb{R}^{n\n\\times d} \\rightarrow \\mathbb{R}^{d}$ into a sequence contextualizer of the\nform $f:\\mathbb{R}^{n \\times d} \\rightarrow \\mathbb{R}^{n \\times d}$. Thus, our\nproposals not only open up a path for further scalability of RvNNs but also\nstandardize a way to use BT-RvNNs as another building block in the deep\nlearning toolkit that can be easily stacked or interfaced with other popular\nmodels such as Transformers and Structured State Space models.\n","authors":["Jishnu Ray Chowdhury","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2307.10779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10774v1","updated":"2023-07-20T11:14:24Z","published":"2023-07-20T11:14:24Z","title":"Assessing the Use of AutoML for Data-Driven Software Engineering","summary":" Background. 
Due to the widespread adoption of Artificial Intelligence (AI)\nand Machine Learning (ML) for building software applications, companies are\nstruggling to recruit employees with a deep understanding of such technologies.\nIn this scenario, AutoML is soaring as a promising solution to fill the AI/ML\nskills gap since it promises to automate the building of end-to-end AI/ML\npipelines that would normally be engineered by specialized team members. Aims.\nDespite the growing interest and high expectations, there is a dearth of\ninformation about the extent to which AutoML is currently adopted by teams\ndeveloping AI/ML-enabled systems and how it is perceived by practitioners and\nresearchers. Method. To fill these gaps, in this paper, we present a\nmixed-method study comprising a benchmark of 12 end-to-end AutoML tools on two\nSE datasets and a user survey with follow-up interviews to further our\nunderstanding of AutoML adoption and perception. Results. We found that AutoML\nsolutions can generate models that outperform those trained and optimized by\nresearchers to perform classification tasks in the SE domain. Also, our\nfindings show that the currently available AutoML solutions do not live up to\ntheir names as they do not equally support automation across the stages of the\nML development workflow and for all the team members. Conclusions. We derive\ninsights to inform the SE research community on how AutoML can facilitate their\nactivities and tool builders on how to design the next generation of AutoML\ntechnologies.\n","authors":["Fabio Calefato","Luigi Quaranta","Filippo Lanubile","Marcos Kalinowski"],"pdf_url":"https://arxiv.org/pdf/2307.10774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10773v1","updated":"2023-07-20T11:10:06Z","published":"2023-07-20T11:10:06Z","title":"Music Genre Classification with ResNet and Bi-GRU Using Visual\n Spectrograms","summary":" Music recommendation systems have emerged as a vital component to enhance\nuser experience and satisfaction for the music streaming services, which\ndominate music consumption. The key challenge in improving these recommender\nsystems lies in comprehending the complexity of music data, specifically for\nthe underpinning music genre classification. The limitations of manual genre\nclassification have highlighted the need for a more advanced system, namely the\nAutomatic Music Genre Classification (AMGC) system. While traditional machine\nlearning techniques have shown potential in genre classification, they heavily\nrely on manually engineered features and feature selection, failing to capture\nthe full complexity of music data. On the other hand, deep learning\nclassification architectures like the traditional Convolutional Neural Networks\n(CNN) are effective in capturing the spatial hierarchies but struggle to\ncapture the temporal dynamics inherent in music data. To address these\nchallenges, this study proposes a novel approach using visual spectrograms as\ninput, and proposes a hybrid model that combines the strength of the Residual\nNeural Network (ResNet) and the Gated Recurrent Unit (GRU). 
This model is\ndesigned to provide a more comprehensive analysis of music data, offering the\npotential to improve the music recommender systems through achieving a more\ncomprehensive analysis of music data and hence potentially more accurate genre\nclassification.\n","authors":["Junfei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10768v1","updated":"2023-07-20T10:57:02Z","published":"2023-07-20T10:57:02Z","title":"Decoding the Enigma: Benchmarking Humans and AIs on the Many Facets of\n Working Memory","summary":" Working memory (WM), a fundamental cognitive process facilitating the\ntemporary storage, integration, manipulation, and retrieval of information,\nplays a vital role in reasoning and decision-making tasks. Robust benchmark\ndatasets that capture the multifaceted nature of WM are crucial for the\neffective development and evaluation of AI WM models. Here, we introduce a\ncomprehensive Working Memory (WorM) benchmark dataset for this purpose. WorM\ncomprises 10 tasks and a total of 1 million trials, assessing 4\nfunctionalities, 3 domains, and 11 behavioral and neural characteristics of WM.\nWe jointly trained and tested state-of-the-art recurrent neural networks and\ntransformers on all these tasks. We also include human behavioral benchmarks as\nan upper bound for comparison. Our results suggest that AI models replicate\nsome characteristics of WM in the brain, most notably primacy and recency\neffects, and neural clusters and correlates specialized for different domains\nand functionalities of WM. In the experiments, we also reveal some limitations\nin existing models to approximate human behavior. This dataset serves as a\nvaluable resource for communities in cognitive psychology, neuroscience, and\nAI, offering a standardized framework to compare and enhance WM models,\ninvestigate WM's neural underpinnings, and develop WM models with human-like\ncapabilities. Our source code and data are available at\nhttps://github.com/ZhangLab-DeepNeuroCogLab/WorM.\n","authors":["Ankur Sikarwar","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10763v1","updated":"2023-07-20T10:53:12Z","published":"2023-07-20T10:53:12Z","title":"MSQNet: Actor-agnostic Action Recognition with Multi-modal Query","summary":" Existing action recognition methods are typically actor-specific due to the\nintrinsic topological and apparent differences among the actors. This requires\nactor-specific pose estimation (e.g., humans vs. animals), leading to\ncumbersome model design complexity and high maintenance costs. Moreover, they\noften focus on learning the visual modality alone and single-label\nclassification whilst neglecting other available information sources (e.g.,\nclass name text) and the concurrent occurrence of multiple actions. To overcome\nthese limitations, we propose a new approach called 'actor-agnostic multi-modal\nmulti-label action recognition,' which offers a unified solution for various\ntypes of actors, including humans and animals. We further formulate a novel\nMulti-modal Semantic Query Network (MSQNet) model in a transformer-based object\ndetection framework (e.g., DETR), characterized by leveraging visual and\ntextual modalities to represent the action classes better. The elimination of\nactor-specific model designs is a key advantage, as it removes the need for\nactor pose estimation altogether. 
Extensive experiments on five publicly\navailable benchmarks show that our MSQNet consistently outperforms the prior\narts of actor-specific alternatives on human and animal single- and multi-label\naction recognition tasks by up to 50%. Code will be released at\nhttps://github.com/mondalanindya/MSQNet.\n","authors":["Anindya Mondal","Sauradip Nag","Joaquin M Prada","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13960v2","updated":"2023-07-20T10:26:56Z","published":"2023-06-24T13:29:54Z","title":"Regular SE(3) Group Convolutions for Volumetric Medical Image Analysis","summary":" Regular group convolutional neural networks (G-CNNs) have been shown to\nincrease model performance and improve equivariance to different geometrical\nsymmetries. This work addresses the problem of SE(3), i.e., roto-translation\nequivariance, on volumetric data. Volumetric image data is prevalent in many\nmedical settings. Motivated by the recent work on separable group convolutions,\nwe devise a SE(3) group convolution kernel separated into a continuous SO(3)\n(rotation) kernel and a spatial kernel. We approximate equivariance to the\ncontinuous setting by sampling uniform SO(3) grids. Our continuous SO(3) kernel\nis parameterized via RBF interpolation on similarly uniform grids. We\ndemonstrate the advantages of our approach in volumetric medical image\nanalysis. Our SE(3) equivariant models consistently outperform CNNs and regular\ndiscrete G-CNNs on challenging medical classification tasks and show\nsignificantly improved generalization capabilities. Our approach achieves up to\na 16.5% gain in accuracy over regular CNNs.\n","authors":["Thijs P. Kuipers","Erik J. Bekkers"],"pdf_url":"https://arxiv.org/pdf/2306.13960v2.pdf","comment":"10 pages, 1 figure, 2 tables, accepted at MICCAI 2023. Updated\n version to camera ready version 1"},{"id":"http://arxiv.org/abs/2307.10749v1","updated":"2023-07-20T10:24:18Z","published":"2023-07-20T10:24:18Z","title":"Mitigating Voter Attribute Bias for Fair Opinion Aggregation","summary":" The aggregation of multiple opinions plays a crucial role in decision-making,\nsuch as in hiring and loan review, and in labeling data for supervised\nlearning. Although majority voting and existing opinion aggregation models are\neffective for simple tasks, they are inappropriate for tasks without\nobjectively true labels in which disagreements may occur. In particular, when\nvoter attributes such as gender or race introduce bias into opinions, the\naggregation results may vary depending on the composition of voter attributes.\nA balanced group of voters is desirable for fair aggregation results but may be\ndifficult to prepare. In this study, we consider methods to achieve fair\nopinion aggregation based on voter attributes and evaluate the fairness of the\naggregated results. To this end, we consider an approach that combines opinion\naggregation models such as majority voting and the Dawid and Skene model (D&S\nmodel) with fairness options such as sample weighting. To evaluate the fairness\nof opinion aggregation, probabilistic soft labels are preferred over discrete\nclass labels. First, we address the problem of soft label estimation without\nconsidering voter attributes and identify some issues with the D&S model. To\naddress these limitations, we propose a new Soft D&S model with improved\naccuracy in estimating soft labels. 
Moreover, we evaluated the fairness of an\nopinion aggregation model, including Soft D&S, in combination with different\nfairness options using synthetic and semi-synthetic data. The experimental\nresults suggest that the combination of Soft D&S and data splitting as a\nfairness option is effective for dense data, whereas weighted majority voting\nis effective for sparse data. These findings should prove particularly valuable\nin supporting decision-making by human and machine-learning models with\nbalanced opinion aggregation.\n","authors":["Ryosuke Ueda","Koh Takeuchi","Hisashi Kashima"],"pdf_url":"https://arxiv.org/pdf/2307.10749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10738v1","updated":"2023-07-20T10:04:55Z","published":"2023-07-20T10:04:55Z","title":"Fairness-Aware Client Selection for Federated Learning","summary":" Federated learning (FL) has enabled multiple data owners (a.k.a. FL clients)\nto train machine learning models collaboratively without revealing private\ndata. Since the FL server can only engage a limited number of clients in each\ntraining round, FL client selection has become an important research problem.\nExisting approaches generally focus on either enhancing FL model performance or\nenhancing the fair treatment of FL clients. The problem of balancing\nperformance and fairness considerations when selecting FL clients remains open.\nTo address this problem, we propose the Fairness-aware Federated Client\nSelection (FairFedCS) approach. Based on Lyapunov optimization, it dynamically\nadjusts FL clients' selection probabilities by jointly considering their\nreputations, times of participation in FL tasks and contributions to the\nresulting model performance. By not using threshold-based reputation filtering,\nit provides FL clients with opportunities to redeem their reputations after a\nperceived poor performance, thereby further enhancing fair client treatment.\nExtensive experiments based on real-world multimedia datasets show that\nFairFedCS achieves 19.6% higher fairness and 0.73% higher test accuracy on\naverage than the best-performing state-of-the-art approach.\n","authors":["Yuxin Shi","Zelei Liu","Zhuan Shi","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2307.10738v1.pdf","comment":"Accepted by ICME 2023"},{"id":"http://arxiv.org/abs/2307.10736v1","updated":"2023-07-20T10:03:50Z","published":"2023-07-20T10:03:50Z","title":"Long-Tail Theory under Gaussian Mixtures","summary":" We suggest a simple Gaussian mixture model for data generation that complies\nwith Feldman's long tail theory (2020). We demonstrate that a linear classifier\ncannot decrease the generalization error below a certain level in the proposed\nmodel, whereas a nonlinear classifier with a memorization capacity can. This\nconfirms that for long-tailed distributions, rare training examples must be\nconsidered for optimal generalization to new data. 
Finally, we show that the\nperformance gap between linear and nonlinear models can be lessened as the tail\nbecomes shorter in the subpopulation frequency distribution, as confirmed by\nexperiments on synthetic and real data.\n","authors":["Arman Bolatov","Maxat Tezekbayev","Igor Melnykov","Artur Pak","Vassilina Nikoulina","Zhenisbek Assylbekov"],"pdf_url":"https://arxiv.org/pdf/2307.10736v1.pdf","comment":"accepted to ECAI 2023"},{"id":"http://arxiv.org/abs/2307.10718v1","updated":"2023-07-20T09:24:23Z","published":"2023-07-20T09:24:23Z","title":"Differences Between Hard and Noisy-labeled Samples: An Empirical Study","summary":" Extracting noisy or incorrectly labeled samples from a labeled dataset with\nhard/difficult samples is an important yet under-explored topic. Two general\nand often independent lines of work exist, one focuses on addressing noisy\nlabels, and another deals with hard samples. However, when both types of data\nare present, most existing methods treat them equally, which results in a\ndecline in the overall performance of the model. In this paper, we first design\nvarious synthetic datasets with custom hardness and noisiness levels for\ndifferent samples. Our proposed systematic empirical study enables us to better\nunderstand the similarities and more importantly the differences between\nhard-to-learn samples and incorrectly-labeled samples. These controlled\nexperiments pave the way for the development of methods that distinguish\nbetween hard and noisy samples. Through our study, we introduce a simple yet\neffective metric that filters out noisy-labeled samples while keeping the hard\nsamples. We study various data partitioning methods in the presence of label\nnoise and observe that filtering out noisy samples from hard samples with this\nproposed metric results in the best datasets as evidenced by the high test\naccuracy achieved after models are trained on the filtered datasets. We\ndemonstrate this for both our created synthetic datasets and for datasets with\nreal-world label noise. Furthermore, our proposed data partitioning method\nsignificantly outperforms other methods when employed within a semi-supervised\nlearning framework.\n","authors":["Mahsa Forouzesh","Patrick Thiran"],"pdf_url":"https://arxiv.org/pdf/2307.10718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10710v1","updated":"2023-07-20T09:05:46Z","published":"2023-07-20T09:05:46Z","title":"Reparameterized Policy Learning for Multimodal Trajectory Optimization","summary":" We investigate the challenge of parametrizing policies for reinforcement\nlearning (RL) in high-dimensional continuous action spaces. Our objective is to\ndevelop a multimodal policy that overcomes limitations inherent in the\ncommonly-used Gaussian parameterization. To achieve this, we propose a\nprincipled framework that models the continuous RL policy as a generative model\nof optimal trajectories. By conditioning the policy on a latent variable, we\nderive a novel variational bound as the optimization objective, which promotes\nexploration of the environment. We then present a practical model-based RL\nmethod, called Reparameterized Policy Gradient (RPG), which leverages the\nmultimodal policy parameterization and learned world model to achieve strong\nexploration capabilities and high data efficiency. Empirical results\ndemonstrate that our method can help agents evade local optima in tasks with\ndense rewards and solve challenging sparse-reward environments by incorporating\nan object-centric intrinsic reward. 
Our method consistently outperforms\nprevious approaches across a range of tasks. Code and supplementary materials\nare available on the project page https://haosulab.github.io/RPG/\n","authors":["Zhiao Huang","Litian Liang","Zhan Ling","Xuanlin Li","Chuang Gan","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.10710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10705v1","updated":"2023-07-20T08:53:47Z","published":"2023-07-20T08:53:47Z","title":"TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and\n Lane Segmentation in Self-Driving Cars","summary":" Semantic segmentation is a common task in autonomous driving to understand\nthe surrounding environment. Driveable Area Segmentation and Lane Detection are\nparticularly important for safe and efficient navigation on the road. However,\noriginal semantic segmentation models are computationally expensive and require\nhigh-end hardware, which is not feasible for embedded systems in autonomous\nvehicles. This paper proposes a lightweight model for the driveable area and\nlane line segmentation. TwinLiteNet is designed cheaply but achieves accurate\nand efficient segmentation results. We evaluate TwinLiteNet on the BDD100K\ndataset and compare it with modern models. Experimental results show that our\nTwinLiteNet performs similarly to existing approaches, requiring significantly\nfewer computational resources. Specifically, TwinLiteNet achieves a mIoU score\nof 91.3% for the Drivable Area task and 31.08% IoU for the Lane Detection task\nwith only 0.4 million parameters and achieves 415 FPS on GPU RTX A5000.\nFurthermore, TwinLiteNet can run in real-time on embedded devices with limited\ncomputing power, especially since it achieves 60FPS on Jetson Xavier NX, making\nit an ideal solution for self-driving vehicles. Code is available:\nurl{https://github.com/chequanghuy/TwinLiteNet}.\n","authors":["Quang Huy Che","Dinh Phuc Nguyen","Minh Quan Pham","Duc Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2307.10705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10704v1","updated":"2023-07-20T08:53:16Z","published":"2023-07-20T08:53:16Z","title":"Decentralized Smart Charging of Large-Scale EVs using Adaptive\n Multi-Agent Multi-Armed Bandits","summary":" The drastic growth of electric vehicles and photovoltaics can introduce new\nchallenges, such as electrical current congestion and voltage limit violations\ndue to peak load demands. These issues can be mitigated by controlling the\noperation of electric vehicles i.e., smart charging. Centralized smart charging\nsolutions have already been proposed in the literature. But such solutions may\nlack scalability and suffer from inherent drawbacks of centralization, such as\na single point of failure, and data privacy concerns. Decentralization can help\ntackle these challenges. In this paper, a fully decentralized smart charging\nsystem is proposed using the philosophy of adaptive multi-agent systems. The\nproposed system utilizes multi-armed bandit learning to handle uncertainties in\nthe system. The presented system is decentralized, scalable, real-time,\nmodel-free, and takes fairness among different players into account. 
A detailed\ncase study is also presented for performance evaluation.\n","authors":["Sharyal Zafar","Raphaël Feraud","Anne Blavette","Guy Camilleri","Hamid Ben"],"pdf_url":"https://arxiv.org/pdf/2307.10704v1.pdf","comment":"CIRED 2023 International Conference & Exhibition on Electricity\n Distribution, Jun 2023, Rome, Italy"},{"id":"http://arxiv.org/abs/2307.10703v1","updated":"2023-07-20T08:50:16Z","published":"2023-07-20T08:50:16Z","title":"Graphs in State-Space Models for Granger Causality in Climate Science","summary":" Granger causality (GC) is often considered not an actual form of causality.\nStill, it is arguably the most widely used method to assess the predictability\nof a time series from another one. Granger causality has been widely used in\nmany applied disciplines, from neuroscience and econometrics to Earth sciences.\nWe revisit GC under a graphical perspective of state-space models. For that, we\nuse GraphEM, a recently presented expectation-maximisation algorithm for\nestimating the linear matrix operator in the state equation of a\nlinear-Gaussian state-space model. Lasso regularisation is included in the\nM-step, which is solved using a proximal splitting Douglas-Rachford algorithm.\nExperiments in toy examples and challenging climate problems illustrate the\nbenefits of the proposed model and inference technique over standard Granger\ncausality methods.\n","authors":["Víctor Elvira","Émilie Chouzenoux","Jordi Cerdà","Gustau Camps-Valls"],"pdf_url":"https://arxiv.org/pdf/2307.10703v1.pdf","comment":"4 pages, 2 figures, 3 tables, CausalStats23: When Causal Inference\n meets Statistical Analysis, April 17-21, 2023, Paris, France"},{"id":"http://arxiv.org/abs/2205.09753v2","updated":"2023-07-20T08:41:46Z","published":"2022-04-30T07:08:30Z","title":"HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory\n Prediction via Scene Encoding","summary":" Encoding a driving scene into vector representations has been an essential\ntask for autonomous driving that can benefit downstream tasks e.g. trajectory\nprediction. The driving scene often involves heterogeneous elements such as the\ndifferent types of objects (agents, lanes, traffic signs) and the semantic\nrelations between objects are rich and diverse. Meanwhile, there also exist\nrelativity across elements, which means that the spatial relation is a relative\nconcept and need be encoded in a ego-centric manner instead of in a global\ncoordinate system. Based on these observations, we propose Heterogeneous\nDriving Graph Transformer (HDGT), a backbone modelling the driving scene as a\nheterogeneous graph with different types of nodes and edges. For heterogeneous\ngraph construction, we connect different types of nodes according to diverse\nsemantic relations. For spatial relation encoding, the coordinates of the node\nas well as its in-edges are in the local node-centric coordinate system. For\nthe aggregation module in the graph neural network (GNN), we adopt the\ntransformer structure in a hierarchical way to fit the heterogeneous nature of\ninputs. Experimental results show that HDGT achieves state-of-the-art\nperformance for the task of trajectory prediction, on INTERACTION Prediction\nChallenge and Waymo Open Motion Challenge.\n","authors":["Xiaosong Jia","Penghao Wu","Li Chen","Yu Liu","Hongyang Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2205.09753v2.pdf","comment":"Accepted at IEEE TPAMI in 2023. 
Code url:\n https://github.com/OpenDriveLab/HDGT"},{"id":"http://arxiv.org/abs/2307.10695v1","updated":"2023-07-20T08:38:01Z","published":"2023-07-20T08:38:01Z","title":"Self2Self+: Single-Image Denoising with Self-Supervised Learning and\n Image Quality Assessment Loss","summary":" Recently, denoising methods based on supervised learning have exhibited\npromising performance. However, their reliance on external datasets containing\nnoisy-clean image pairs restricts their applicability. To address this\nlimitation, researchers have focused on training denoising networks using\nsolely a set of noisy inputs. To improve the feasibility of denoising\nprocedures, in this study, we proposed a single-image self-supervised learning\nmethod in which only the noisy input image is used for network training. Gated\nconvolution was used for feature extraction and no-reference image quality\nassessment was used for guiding the training process. Moreover, the proposed\nmethod sampled instances from the input image dataset using Bernoulli sampling\nwith a certain dropout rate for training. The corresponding result was produced\nby averaging the generated predictions from various instances of the trained\nnetwork with dropouts. The experimental results indicated that the proposed\nmethod achieved state-of-the-art denoising performance on both synthetic and\nreal-world datasets. This highlights the effectiveness and practicality of our\nmethod as a potential solution for various noise removal tasks.\n","authors":["Jaekyun Ko","Sanghwan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.10695v1.pdf","comment":"Technical report and supplemantry materials are combined into one\n paper. - Technical report: Page 1~7 - Supplemantry materials : Page 8~18"},{"id":"http://arxiv.org/abs/2302.08292v3","updated":"2023-07-20T08:35:26Z","published":"2023-02-16T13:41:19Z","title":"Navya3DSeg -- Navya 3D Semantic Segmentation Dataset & split generation\n for autonomous vehicles","summary":" Autonomous driving (AD) perception today relies heavily on deep learning\nbased architectures requiring large scale annotated datasets with their\nassociated costs for curation and annotation. The 3D semantic data are useful\nfor core perception tasks such as obstacle detection and ego-vehicle\nlocalization. We propose a new dataset, Navya 3D Segmentation (Navya3DSeg),\nwith a diverse label space corresponding to a large scale production grade\noperational domain, including rural, urban, industrial sites and universities\nfrom 13 countries. It contains 23 labeled sequences and 25 supplementary\nsequences without labels, designed to explore self-supervised and\nsemi-supervised semantic segmentation benchmarks on point clouds. We also\npropose a novel method for sequential dataset split generation based on\niterative multi-label stratification, and demonstrated to achieve a +1.2% mIoU\nimprovement over the original split proposed by SemanticKITTI dataset. A\ncomplete benchmark for semantic segmentation task was performed, with state of\nthe art methods. Finally, we demonstrate an Active Learning (AL) based dataset\ndistillation framework. We introduce a novel heuristic-free sampling method\ncalled ego-pose distance based sampling in the context of AL. A detailed\npresentation on the dataset is available here\nhttps://www.youtube.com/watch?v=5m6ALIs-s20.\n","authors":["Alexandre Almin","Léo Lemarié","Anh Duong","B Ravi Kiran"],"pdf_url":"https://arxiv.org/pdf/2302.08292v3.pdf","comment":"Accepted version to IEEE RA-L. 
Version with supplementary materials"},{"id":"http://arxiv.org/abs/2307.10683v1","updated":"2023-07-20T08:20:12Z","published":"2023-07-20T08:20:12Z","title":"Fractional Denoising for 3D Molecular Pre-training","summary":" Coordinate denoising is a promising 3D molecular pre-training method, which\nhas achieved remarkable performance in various downstream drug discovery tasks.\nTheoretically, the objective is equivalent to learning the force field, which\nis revealed helpful for downstream tasks. Nevertheless, there are two\nchallenges for coordinate denoising to learn an effective force field, i.e. low\ncoverage samples and isotropic force field. The underlying reason is that\nmolecular distributions assumed by existing denoising methods fail to capture\nthe anisotropic characteristic of molecules. To tackle these challenges, we\npropose a novel hybrid noise strategy, including noises on both dihedral angel\nand coordinate. However, denoising such hybrid noise in a traditional way is no\nmore equivalent to learning the force field. Through theoretical deductions, we\nfind that the problem is caused by the dependency of the input conformation for\ncovariance. To this end, we propose to decouple the two types of noise and\ndesign a novel fractional denoising method (Frad), which only denoises the\nlatter coordinate part. In this way, Frad enjoys both the merits of sampling\nmore low-energy structures and the force field equivalence. Extensive\nexperiments show the effectiveness of Frad in molecular representation, with a\nnew state-of-the-art on 9 out of 12 tasks of QM9 and on 7 out of 8 targets of\nMD17.\n","authors":["Shikun Feng","Yuyan Ni","Yanyan Lan","Zhi-Ming Ma","Wei-Ying Ma"],"pdf_url":"https://arxiv.org/pdf/2307.10683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10677v1","updated":"2023-07-20T07:57:14Z","published":"2023-07-20T07:57:14Z","title":"Deep learning for classification of noisy QR codes","summary":" We wish to define the limits of a classical classification model based on\ndeep learning when applied to abstract images, which do not represent visually\nidentifiable objects.QR codes (Quick Response codes) fall into this category of\nabstract images: one bit corresponding to one encoded character, QR codes were\nnot designed to be decoded manually. To understand the limitations of a deep\nlearning-based model for abstract image classification, we train an image\nclassification model on QR codes generated from information obtained when\nreading a health pass. We compare a classification model with a classical\n(deterministic) decoding method in the presence of noise. This study allows us\nto conclude that a model based on deep learning can be relevant for the\nunderstanding of abstract images.\n","authors":["Rebecca Leygonie","Sylvain Lobry"," )","Laurent Wendling (LIPADE)"],"pdf_url":"https://arxiv.org/pdf/2307.10677v1.pdf","comment":"in French language. RFIAP 2022 - Reconnaissance des Formes, Image,\n Apprentissage et Perception, Jul 2022, Vannes (Bretagne), France"},{"id":"http://arxiv.org/abs/2307.07666v2","updated":"2023-07-20T07:55:04Z","published":"2023-07-15T00:26:51Z","title":"Efficient Action Robust Reinforcement Learning with Probabilistic Policy\n Execution Uncertainty","summary":" Robust reinforcement learning (RL) aims to find a policy that optimizes the\nworst-case performance in the face of uncertainties. 
In this paper, we focus on\naction robust RL with the probabilistic policy execution uncertainty, in which,\ninstead of always carrying out the action specified by the policy, the agent\nwill take the action specified by the policy with probability $1-\\rho$ and an\nalternative adversarial action with probability $\\rho$. We establish the\nexistence of an optimal policy on the action robust MDPs with probabilistic\npolicy execution uncertainty and provide the action robust Bellman optimality\nequation for its solution. Furthermore, we develop Action Robust Reinforcement\nLearning with Certificates (ARRLC) algorithm that achieves minimax optimal\nregret and sample complexity. Furthermore, we conduct numerical experiments to\nvalidate our approach's robustness, demonstrating that ARRLC outperforms\nnon-robust RL algorithms and converges faster than the robust TD algorithm in\nthe presence of action perturbations.\n","authors":["Guanlin Liu","Zhihan Zhou","Han Liu","Lifeng Lai"],"pdf_url":"https://arxiv.org/pdf/2307.07666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10655v1","updated":"2023-07-20T07:35:42Z","published":"2023-07-20T07:35:42Z","title":"A Survey of What to Share in Federated Learning: Perspectives on Model\n Utility, Privacy Leakage, and Communication Efficiency","summary":" Federated learning (FL) has emerged as a highly effective paradigm for\nprivacy-preserving collaborative training among different parties. Unlike\ntraditional centralized learning, which requires collecting data from each\nparty, FL allows clients to share privacy-preserving information without\nexposing private datasets. This approach not only guarantees enhanced privacy\nprotection but also facilitates more efficient and secure collaboration among\nmultiple participants. Therefore, FL has gained considerable attention from\nresearchers, promoting numerous surveys to summarize the related works.\nHowever, the majority of these surveys concentrate on methods sharing model\nparameters during the training process, while overlooking the potential of\nsharing other forms of local information. In this paper, we present a\nsystematic survey from a new perspective, i.e., what to share in FL, with an\nemphasis on the model utility, privacy leakage, and communication efficiency.\nThis survey differs from previous ones due to four distinct contributions.\nFirst, we present a new taxonomy of FL methods in terms of the sharing methods,\nwhich includes three categories of shared information: model sharing, synthetic\ndata sharing, and knowledge sharing. Second, we analyze the vulnerability of\ndifferent sharing methods to privacy attacks and review the defense mechanisms\nthat provide certain privacy guarantees. Third, we conduct extensive\nexperiments to compare the performance and communication overhead of various\nsharing methods in FL. Besides, we assess the potential privacy leakage through\nmodel inversion and membership inference attacks, while comparing the\neffectiveness of various defense approaches. 
Finally, we discuss potential\ndeficiencies in current methods and outline future directions for improvement.\n","authors":["Jiawei Shao","Zijian Li","Wenqiang Sun","Tailin Zhou","Yuchang Sun","Lumin Liu","Zehong Lin","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10654v1","updated":"2023-07-20T07:35:15Z","published":"2023-07-20T07:35:15Z","title":"Conditional expectation network for SHAP","summary":" A very popular model-agnostic technique for explaining predictive models is\nthe SHapley Additive exPlanation (SHAP). The two most popular versions of SHAP\nare a conditional expectation version and an unconditional expectation version\n(the latter is also known as interventional SHAP). Except for tree-based\nmethods, usually the unconditional version is used (for computational reasons).\nWe provide a (surrogate) neural network approach which allows us to efficiently\ncalculate the conditional version for both neural networks and other regression\nmodels, and which properly considers the dependence structure in the feature\ncomponents. This proposal is also useful to provide drop1 and anova analyses in\ncomplex regression models which are similar to their generalized linear model\n(GLM) counterparts, and we provide a partial dependence plot (PDP) counterpart\nthat considers the right dependence structure in the feature components.\n","authors":["Ronald Richman","Mario V. Wüthrich"],"pdf_url":"https://arxiv.org/pdf/2307.10654v1.pdf","comment":"24 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10653v1","updated":"2023-07-20T07:33:36Z","published":"2023-07-20T07:33:36Z","title":"Refining the Optimization Target for Automatic Univariate Time Series\n Anomaly Detection in Monitoring Services","summary":" Time series anomaly detection is crucial for industrial monitoring services\nthat handle a large volume of data, aiming to ensure reliability and optimize\nsystem performance. Existing methods often require extensive labeled resources\nand manual parameter selection, highlighting the need for automation. This\npaper proposes a comprehensive framework for automatic parameter optimization\nin time series anomaly detection models. The framework introduces three\noptimization targets: prediction score, shape score, and sensitivity score,\nwhich can be easily adapted to different model backbones without prior\nknowledge or manual labeling efforts. The proposed framework has been\nsuccessfully applied online for over six months, serving more than 50,000 time\nseries every minute. It simplifies the user's experience by requiring only an\nexpected sensitive value, offering a user-friendly interface, and achieving\ndesired detection results. Extensive evaluations conducted on public datasets\nand comparison with other methods further confirm the effectiveness of the\nproposed framework.\n","authors":["Manqing Dong","Zhanxiang Zhao","Yitong Geng","Wentao Li","Wei Wang","Huai Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.10653v1.pdf","comment":"Accepted by 2023 IJCAI Workshop"},{"id":"http://arxiv.org/abs/2307.10648v1","updated":"2023-07-20T07:23:15Z","published":"2023-07-20T07:23:15Z","title":"Data-Driven Latency Probability Prediction for Wireless Networks:\n Focusing on Tail Probabilities","summary":" With the emergence of new application areas, such as cyber-physical systems\nand human-in-the-loop applications, there is a need to guarantee a certain\nlevel of end-to-end network latency with extremely high reliability, e.g.,\n99.999%. 
While mechanisms specified under IEEE 802.1as time-sensitive\nnetworking (TSN) can be used to achieve these requirements for switched\nEthernet networks, implementing TSN mechanisms in wireless networks is\nchallenging due to their stochastic nature. To conform the wireless link to a\nreliability level of 99.999%, the behavior of extremely rare outliers in the\nlatency probability distribution, or the tail of the distribution, must be\nanalyzed and controlled. This work proposes predicting the tail of the latency\ndistribution using state-of-the-art data-driven approaches, such as mixture\ndensity networks (MDN) and extreme value mixture models, to estimate the\nlikelihood of rare latencies conditioned on the network parameters, which can\nbe used to make more informed decisions in wireless transmission. Actual\nlatency measurements of IEEE 802.11g (WiFi), commercial private and a\nsoftware-defined 5G network are used to benchmark the proposed approaches and\nevaluate their sensitivities concerning the tail probabilities.\n","authors":["Samie Mostafavi","Gourav Prateek Sharma","James Gross"],"pdf_url":"https://arxiv.org/pdf/2307.10648v1.pdf","comment":"Submitted to IEEE Global Communications (GLOBECOM) 2023 conference"},{"id":"http://arxiv.org/abs/2305.15776v2","updated":"2023-07-20T07:20:20Z","published":"2023-05-25T06:43:42Z","title":"AUC Optimization from Multiple Unlabeled Datasets","summary":" Weakly supervised learning aims to empower machine learning when the perfect\nsupervision is unavailable, which has drawn great attention from researchers.\nAmong various types of weak supervision, one of the most challenging cases is\nto learn from multiple unlabeled (U) datasets with only a little knowledge of\nthe class priors, or U$^m$ learning for short. In this paper, we study the\nproblem of building an AUC (area under ROC curve) optimization model from\nmultiple unlabeled datasets, which maximizes the pairwise ranking ability of\nthe classifier. We propose U$^m$-AUC, an AUC optimization approach that\nconverts the U$^m$ data into a multi-label AUC optimization problem, and can be\ntrained efficiently. We show that the proposed U$^m$-AUC is effective\ntheoretically and empirically.\n","authors":["Yu Liu","Zheng Xie","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2305.15776v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10644v1","updated":"2023-07-20T07:14:58Z","published":"2023-07-20T07:14:58Z","title":"Fisher-Rao distance and pullback SPD cone distances between multivariate\n normal distributions","summary":" Data sets of multivariate normal distributions abound in many scientific\nareas like diffusion tensor imaging, structure tensor computer vision, radar\nsignal processing, machine learning, just to name a few. In order to process\nthose normal data sets for downstream tasks like filtering, classification or\nclustering, one needs to define proper notions of dissimilarities between\nnormals and paths joining them. The Fisher-Rao distance defined as the\nRiemannian geodesic distance induced by the Fisher information metric is such a\nprincipled metric distance which however is not known in closed-form excepts\nfor a few particular cases. In this work, we first report a fast and robust\nmethod to approximate arbitrarily finely the Fisher-Rao distance between\nmultivariate normal distributions. 
Second, we introduce a class of distances\nbased on diffeomorphic embeddings of the normal manifold into a submanifold of\nthe higher-dimensional symmetric positive-definite cone corresponding to the\nmanifold of centered normal distributions. We show that the projective Hilbert\ndistance on the cone yields a metric on the embedded normal submanifold and we\npullback that cone distance with its associated straight line Hilbert cone\ngeodesics to obtain a distance and smooth paths between normal distributions.\nCompared to the Fisher-Rao distance approximation, the pullback Hilbert cone\ndistance is computationally light since it requires to compute only the extreme\nminimal and maximal eigenvalues of matrices. Finally, we show how to use those\ndistances in clustering tasks.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2307.10644v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2208.06620v2","updated":"2023-07-20T07:09:45Z","published":"2022-08-13T10:36:04Z","title":"Opinion Market Model: Stemming Far-Right Opinion Spread using Positive\n Interventions","summary":" Online extremism has severe societal consequences, including normalizing hate\nspeech, user radicalization, and increased social divisions. Various mitigation\nstrategies have been explored to address these consequences. One such strategy\nuses positive interventions: controlled signals that add attention to the\nopinion ecosystem to boost certain opinions. To evaluate the effectiveness of\npositive interventions, we introduce the Opinion Market Model (OMM), a two-tier\nonline opinion ecosystem model that considers both inter-opinion interactions\nand the role of positive interventions. The size of the opinion attention\nmarket is modeled in the first tier using the multivariate discrete-time Hawkes\nprocess; in the second tier, opinions cooperate and compete for market share,\ngiven limited attention using the market share attraction model. We demonstrate\nthe convergence of our proposed estimation scheme on a synthetic dataset. Next,\nwe test OMM on two learning tasks, applying to two real-world datasets to\npredict attention market shares and uncover latent relationships between online\nitems. The first dataset comprises Facebook and Twitter discussions containing\nmoderate and far-right opinions about bushfires and climate change. The second\ndataset captures popular VEVO artists' YouTube and Twitter attention volumes.\nOMM outperforms the state-of-the-art predictive models on both datasets and\ncaptures latent cooperation-competition relations. We uncover (1) self- and\ncross-reinforcement between far-right and moderate opinions on the bushfires\nand (2) pairwise artist relations that correlate with real-world interactions\nsuch as collaborations and long-lasting feuds. Lastly, we use OMM as a testbed\nfor positive interventions and show how media coverage modulates the spread of\nfar-right opinions.\n","authors":["Pio Calderon","Rohit Ram","Marian-Andrei Rizoiu"],"pdf_url":"https://arxiv.org/pdf/2208.06620v2.pdf","comment":"accepted in the 18th AAAI International Conference on Web and Social\n Media (ICWSM'24)"},{"id":"http://arxiv.org/abs/2305.08396v3","updated":"2023-07-20T07:06:03Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" Convolutional Neural Networks (CNNs) have made significant strides in medical\nimage analysis in recent years. 
However, the local nature of the convolution\noperator may pose a limitation for capturing global and long-range interactions\nin CNNs. Recently, Transformers have gained popularity in the computer vision\ncommunity and also medical image segmentation due to their ability to process\nglobal features effectively. The scalability issues of self-attention mechanism\nand lack of the CNN-like inductive bias may have limited their adoption.\nTherefore, hybrid Vision transformers (CNN-Transformer), exploiting advantages\nof both Convolution and Self-attention Mechanisms, have gained importance. In\nthis work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with\nnominal computational burden. The inclusion of multi-axis self-attention,\nwithin each decoder stage, significantly enhances the discriminating capacity\nbetween the object and background regions, and thereby helps in improving the\nsegmentation efficiency. In the Hybrid Decoder block, the fusion process\ncommences by integrating the upsampled lower level decoder features, obtained\nthrough transpose convolution, with the skip-connection features derived from\nthe hybrid encoder. Subsequently, the fused features undergo refinement through\nthe utilization of a multi-axis attention mechanism. The proposed decoder block\nis repeated multiple times to progressively segment the nuclei regions.\nExperimental results on MoNuSeg18 and MoNuSAC20 dataset demonstrates the\neffectiveness of the proposed technique.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10635v1","updated":"2023-07-20T07:01:57Z","published":"2023-07-20T07:01:57Z","title":"SciBench: Evaluating College-Level Scientific Problem-Solving Abilities\n of Large Language Models","summary":" Recent advances in large language models (LLMs) have demonstrated notable\nprogress on many mathematical benchmarks. However, most of these benchmarks\nonly feature problems grounded in junior and senior high school subjects,\ncontain only multiple-choice questions, and are confined to a limited scope of\nelementary arithmetic operations. To address these issues, this paper\nintroduces an expansive benchmark suite SciBench that aims to systematically\nexamine the reasoning capabilities required for complex scientific problem\nsolving. SciBench contains two carefully curated datasets: an open set\nfeaturing a range of collegiate-level scientific problems drawn from\nmathematics, chemistry, and physics textbooks, and a closed set comprising\nproblems from undergraduate-level exams in computer science and mathematics.\nBased on the two datasets, we conduct an in-depth benchmark study of two\nrepresentative LLMs with various prompting strategies. The results reveal that\ncurrent LLMs fall short of delivering satisfactory performance, with an overall\nscore of merely 35.80%. Furthermore, through a detailed user study, we\ncategorize the errors made by LLMs into ten problem-solving abilities. Our\nanalysis indicates that no single prompting strategy significantly outperforms\nothers and some strategies that demonstrate improvements in certain\nproblem-solving skills result in declines in other skills. 
We envision that\nSciBench will catalyze further developments in the reasoning abilities of LLMs,\nthereby ultimately contributing to scientific research and discovery.\n","authors":["Xiaoxuan Wang","Ziniu Hu","Pan Lu","Yanqiao Zhu","Jieyu Zhang","Satyen Subramaniam","Arjun R. Loomba","Shichang Zhang","Yizhou Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2307.10635v1.pdf","comment":"Work in progress, 18 pages"},{"id":"http://arxiv.org/abs/2307.10634v1","updated":"2023-07-20T06:59:02Z","published":"2023-07-20T06:59:02Z","title":"Generative Language Models on Nucleotide Sequences of Human Genes","summary":" Language models, primarily transformer-based ones, obtained colossal success\nin NLP. To be more precise, studies like BERT in NLU and works such as GPT-3\nfor NLG are very crucial. DNA sequences are very close to natural language in\nterms of structure, so if the DNA-related bioinformatics domain is concerned,\ndiscriminative models, like DNABert, exist. Yet, the generative side of the\ncoin is mainly unexplored to the best of our knowledge. Consequently, we\nfocused on developing an autoregressive generative language model like GPT-3\nfor DNA sequences. Because working with whole DNA sequences is challenging\nwithout substantial computational resources, we decided to carry out our study\non a smaller scale, focusing on nucleotide sequences of human genes, unique\nparts in DNA with specific functionalities, instead of the whole DNA. This\ndecision did not change the problem structure a lot due to the fact that both\nDNA and genes can be seen as 1D sequences consisting of four different\nnucleotides without losing much information and making too much simplification.\nFirst of all, we systematically examined an almost entirely unexplored problem\nand observed that RNNs performed the best while simple techniques like N-grams\nwere also promising. Another beneficial point was learning how to work with\ngenerative models on languages we do not understand, unlike natural language.\nHow essential using real-life tasks beyond the classical metrics such as\nperplexity is observed. Furthermore, checking whether the data-hungry nature of\nthese models can be changed through selecting a language with minimal\nvocabulary size, four owing to four different types of nucleotides, is\nexamined. The reason for reviewing this was that choosing such a language might\nmake the problem easier. However, what we observed in this study was it did not\nprovide that much of a change in the amount of data needed.\n","authors":["Musa Nuri Ihtiyar","Arzucan Ozgur"],"pdf_url":"https://arxiv.org/pdf/2307.10634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10633v1","updated":"2023-07-20T06:58:55Z","published":"2023-07-20T06:58:55Z","title":"Multi-Method Self-Training: Improving Code Generation With Text, And\n Vice Versa","summary":" Large Language Models have many methods for solving the same problem. This\nintroduces novel strengths (different methods may work well for different\nproblems) and weaknesses (it may be difficult for users to know which method to\nuse). In this paper, we introduce Multi-Method Self-Training (MMST), where one\nmethod is trained on the filtered outputs of another, allowing us to augment\nthe strengths and ameliorate the weaknesses of each method. 
Using a 176B\nparameter model trained on both language and code, we show that MMST can 1)\nimprove the less performant method (up to 30%) making the model easier to use,\n2) improve the more performant method (up to 32.2%) making the model more\nperformant, and 3) improve the performance of related but distinct tasks (up to\n10.3%) by improving the ability of the model to generate rationales. We then\nconduct ablation analyses to explore why MMST works. We show that MMST\ngenerates more data than traditional self-training, but the improvement in\nperformance is driven by the use of multiple methods. We also analyze\nprompt-engineering and anti-correlated performance between methods as means of\nmaking MMST more effective. We hope the evidence from our paper motivates\nmachine learning researchers to explore ways in which advances in language\nmodels allow for new forms of training.\n","authors":["Shriyash K. Upadhyay","Etan J. Ginsberg"],"pdf_url":"https://arxiv.org/pdf/2307.10633v1.pdf","comment":"23 pages, 3 figures"},{"id":"http://arxiv.org/abs/2211.14085v3","updated":"2023-07-20T06:42:56Z","published":"2022-11-25T13:14:33Z","title":"Positive unlabeled learning with tensor networks","summary":" Positive unlabeled learning is a binary classification problem with positive\nand unlabeled data. It is common in domains where negative labels are costly or\nimpossible to obtain, e.g., medicine and personalized advertising. Most\napproaches to positive unlabeled learning apply to specific data types (e.g.,\nimages, categorical data) and can not generate new positive and negative\nsamples. This work introduces a feature-space distance-based tensor network\napproach to the positive unlabeled learning problem. The presented method is\nnot domain specific and significantly improves the state-of-the-art results on\nthe MNIST image and 15 categorical/mixed datasets. The trained tensor network\nmodel is also a generative model and enables the generation of new positive and\nnegative instances.\n","authors":["Bojan Žunkovič"],"pdf_url":"https://arxiv.org/pdf/2211.14085v3.pdf","comment":"12 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.10617v1","updated":"2023-07-20T06:35:43Z","published":"2023-07-20T06:35:43Z","title":"Detecting deceptive reviews using text classification","summary":" In recent years, online reviews play a vital role for promoting any kind of\nproduct or services. Businesses may embed fake reviews in order to attract\ncustomers to purchase their products. They may even highlight the benefits of\ntheir own product or criticize the competition's product. Marketers,\nadvertisers, and other online business users have incentive to create fake\npositive reviews for products which they want to promote or give fake negative\nreviews for products which they really don't like. So now-a-days writing a\ndeceptive review is inevitable thing for promoting their own business or\ndegrading competitor's reputation. Thus, identifying deceptive reviews is an\nintense and on-going research area. This research paper proposes machine\nlearning model approach to identify deceptive reviews. The paper investigates\nthe performance of the several experiments done on a Deceptive Opinion Spam\nCorpus dataset of restaurants reviews. We developed a n-gram model and max\nfeatures to identify deceptive contents with a particular focus on fake\nreviews. 
Further, we conduct a benchmark study to investigate the performance\nof two different features extraction techniques and apply five machine learning\nclassification techniques. The experimental results show that passive\naggressive classifier outperforms other algorithms, and it reaches the highest\naccuracy not only in text classification but also to fake reviews. We also\nstudy the data augmentation and implement different deep learning techniques.\n","authors":["Anusuya Baby"],"pdf_url":"https://arxiv.org/pdf/2307.10617v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2307.09018v2","updated":"2023-07-20T06:35:34Z","published":"2023-07-18T07:12:46Z","title":"Multimodal LLMs for health grounded in individual-specific data","summary":" Foundation large language models (LLMs) have shown an impressive ability to\nsolve tasks across a wide range of fields including health. To effectively\nsolve personalized health tasks, LLMs need the ability to ingest a diversity of\ndata modalities that are relevant to an individual's health status. In this\npaper, we take a step towards creating multimodal LLMs for health that are\ngrounded in individual-specific data by developing a framework (HeLM: Health\nLarge Language Model for Multimodal Understanding) that enables LLMs to use\nhigh-dimensional clinical modalities to estimate underlying disease risk. HeLM\nencodes complex data modalities by learning an encoder that maps them into the\nLLM's token embedding space and for simple modalities like tabular data by\nserializing the data into text. Using data from the UK Biobank, we show that\nHeLM can effectively use demographic and clinical features in addition to\nhigh-dimensional time-series data to estimate disease risk. For example, HeLM\nachieves an AUROC of 0.75 for asthma prediction when combining tabular and\nspirogram data modalities compared with 0.49 when only using tabular data.\nOverall, we find that HeLM outperforms or performs at parity with classical\nmachine learning approaches across a selection of eight binary traits.\nFurthermore, we investigate the downstream uses of this model such as its\ngeneralizability to out-of-distribution traits and its ability to power\nconversations around individual health and wellness.\n","authors":["Anastasiya Belyaeva","Justin Cosentino","Farhad Hormozdiari","Krish Eswaran","Shravya Shetty","Greg Corrado","Andrew Carroll","Cory Y. McLean","Nicholas A. Furlotte"],"pdf_url":"https://arxiv.org/pdf/2307.09018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10616v1","updated":"2023-07-20T06:32:14Z","published":"2023-07-20T06:32:14Z","title":"Heterogeneous Federated Learning: State-of-the-art and Research\n Challenges","summary":" Federated learning (FL) has drawn increasing attention owing to its potential\nuse in large-scale industrial applications. Existing federated learning works\nmainly focus on model homogeneous settings. However, practical federated\nlearning typically faces the heterogeneity of data distributions, model\narchitectures, network environments, and hardware devices among participant\nclients. Heterogeneous Federated Learning (HFL) is much more challenging, and\ncorresponding solutions are diverse and complex. 
Therefore, a systematic survey\non this topic about the research challenges and state-of-the-art is essential.\nIn this survey, we firstly summarize the various research challenges in HFL\nfrom five aspects: statistical heterogeneity, model heterogeneity,\ncommunication heterogeneity, device heterogeneity, and additional challenges.\nIn addition, recent advances in HFL are reviewed and a new taxonomy of existing\nHFL methods is proposed with an in-depth analysis of their pros and cons. We\nclassify existing methods from three different levels according to the HFL\nprocedure: data-level, model-level, and server-level. Finally, several critical\nand promising future research directions in HFL are discussed, which may\nfacilitate further developments in this field. A periodically updated\ncollection on HFL is available at https://github.com/marswhu/HFL_Survey.\n","authors":["Mang Ye","Xiuwen Fang","Bo Du","Pong C. Yuen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.10616v1.pdf","comment":"42 pages, 11 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2305.18088v4","updated":"2023-07-20T06:29:28Z","published":"2023-05-25T05:34:39Z","title":"Drug Repurposing Targeting COVID-19 3CL Protease using Molecular Docking\n and Machine Learning Regression Approach","summary":" The COVID-19 pandemic has created a global health crisis, driving the need\nfor the rapid identification of potential therapeutics. To meet this challenge,\ndrug repurposing is the only solution with saving cost, time, and labor. In\nthis study, we used the Zinc database to screen the world-approved including\nFDA-approved 5903 drugs for repurposing as potential COVID-19 treatments\ntargeting the main protease 3CL of SARS-CoV-2. We performed molecular docking\nand checked the efficacy of drug molecules. To enhance the efficiency of drug\nrepurposing approach, we modeled the binding affinities using several machine\nlearning regression approaches for QSAR modeling such as decision tree, extra\ntrees, MLP, KNN, XGBoost, and gradient boosting. The computational results\ndemonstrated that Decision Tree Regression (DTR) model has improved statistical\nmeasures of R2 and RMSE. These simulated results helped to identify drugs with\nhigh binding affinity. From the docking and other statistical analysis, we\nshortlisted six promising drugs with their respective Zinc IDs (ZINC3873365,\nZINC85432544, ZINC203757351, ZINC85536956, ZINC8214470 and ZINC261494640)\nwithin the range of -15 kcal/mol to -13 kcal/mol. In the study, the repurposed\ndrugs are novel except ZINC203757351 antiviral compound that has already\nidentified against COVID-19 in other studies. Further, we analyzed the\nphysiochemical and pharmacokinetic properties of these top-ranked selected\ndrugs with respect to their best binding interaction for specific target\nprotease 3CLpro. Our study has provided an efficient framework for drug\nrepurposing against COVID-19. This highlights the potential of combining\nmolecular docking with machine learning regression approaches to accelerate the\nidentification of potential therapeutic candidates.\n","authors":["Imra Aqeel","Abdul Majid"],"pdf_url":"https://arxiv.org/pdf/2305.18088v4.pdf","comment":"27 Pages"},{"id":"http://arxiv.org/abs/2102.03403v2","updated":"2023-07-20T05:58:30Z","published":"2021-02-05T19:59:05Z","title":"Robust Principal Component Analysis: A Median of Means Approach","summary":" Principal Component Analysis (PCA) is a fundamental tool for data\nvisualization, denoising, and dimensionality reduction. 
It is widely popular in\nStatistics, Machine Learning, Computer Vision, and related fields. However, PCA\nis well-known to fall prey to outliers and often fails to detect the true\nunderlying low-dimensional structure within the dataset. Following the Median\nof Means (MoM) philosophy, recent supervised learning methods have shown great\nsuccess in dealing with outlying observations without much compromise to their\nlarge sample theoretical properties. This paper proposes a PCA procedure based\non the MoM principle. Called the \\textbf{M}edian of \\textbf{M}eans\n\\textbf{P}rincipal \\textbf{C}omponent \\textbf{A}nalysis (MoMPCA), the proposed\nmethod is not only computationally appealing but also achieves optimal\nconvergence rates under minimal assumptions. In particular, we explore the\nnon-asymptotic error bounds of the obtained solution via the aid of the\nRademacher complexities while granting absolutely no assumption on the outlying\nobservations. The derived concentration results are not dependent on the\ndimension because the analysis is conducted in a separable Hilbert space, and\nthe results only depend on the fourth moment of the underlying distribution in\nthe corresponding norm. The proposal's efficacy is also thoroughly showcased\nthrough simulations and real data applications.\n","authors":["Debolina Paul","Saptarshi Chakraborty","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2102.03403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10224v4","updated":"2023-07-20T05:42:46Z","published":"2022-08-14T02:41:05Z","title":"Friendly Noise against Adversarial Noise: A Powerful Defense against\n Data Poisoning Attacks","summary":" A powerful category of (invisible) data poisoning attacks modify a subset of\ntraining examples by small adversarial perturbations to change the prediction\nof certain test-time data. Existing defense mechanisms are not desirable to\ndeploy in practice, as they often either drastically harm the generalization\nperformance, or are attack-specific, and prohibitively slow to apply. Here, we\npropose a simple but highly effective approach that unlike existing methods\nbreaks various types of invisible poisoning attacks with the slightest drop in\nthe generalization performance. We make the key observation that attacks\nintroduce local sharp regions of high training loss, which when minimized,\nresults in learning the adversarial perturbations and makes the attack\nsuccessful. To break poisoning attacks, our key idea is to alleviate the sharp\nloss regions introduced by poisons. To do so, our approach comprises two\ncomponents: an optimized friendly noise that is generated to maximally perturb\nexamples without degrading the performance, and a randomly varying noise\ncomponent. The combination of both components builds a very light-weight but\nextremely effective defense against the most powerful triggerless targeted and\nhidden-trigger backdoor poisoning attacks, including Gradient Matching,\nBulls-eye Polytope, and Sleeper Agent. We show that our friendly noise is\ntransferable to other architectures, and adaptive attacks cannot break our\ndefense due to its random noise component. 
Our code is available at:\nhttps://github.com/tianyu139/friendly-noise\n","authors":["Tian Yu Liu","Yu Yang","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2208.10224v4.pdf","comment":"Code available at: https://github.com/tianyu139/friendly-noise"},{"id":"http://arxiv.org/abs/2210.08363v3","updated":"2023-07-20T05:41:18Z","published":"2022-10-15T19:32:20Z","title":"Data-Efficient Augmentation for Training Neural Networks","summary":" Data augmentation is essential to achieve state-of-the-art performance in\nmany deep learning applications. However, the most effective augmentation\ntechniques become computationally prohibitive for even medium-sized datasets.\nTo address this, we propose a rigorous technique to select subsets of data\npoints that when augmented, closely capture the training dynamics of full data\naugmentation. We first show that data augmentation, modeled as additive\nperturbations, improves learning and generalization by relatively enlarging and\nperturbing the smaller singular values of the network Jacobian, while\npreserving its prominent directions. This prevents overfitting and enhances\nlearning the harder to learn information. Then, we propose a framework to\niteratively extract small subsets of training data that when augmented, closely\ncapture the alignment of the fully augmented Jacobian with labels/residuals. We\nprove that stochastic gradient descent applied to the augmented subsets found\nby our approach has similar training dynamics to that of fully augmented data.\nOur experiments demonstrate that our method achieves 6.3x speedup on CIFAR10\nand 2.2x speedup on SVHN, and outperforms the baselines by up to 10% across\nvarious subset sizes. Similarly, on TinyImageNet and ImageNet, our method beats\nthe baselines by up to 8%, while achieving up to 3.3x speedup across various\nsubset sizes. Finally, training on and augmenting 50% subsets using our method\non a version of CIFAR10 corrupted with label noise even outperforms using the\nfull dataset. Our code is available at:\nhttps://github.com/tianyu139/data-efficient-augmentation\n","authors":["Tian Yu Liu","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2210.08363v3.pdf","comment":"Code available at:\n https://github.com/tianyu139/data-efficient-augmentation"},{"id":"http://arxiv.org/abs/2206.08309v2","updated":"2023-07-20T05:32:00Z","published":"2022-06-16T17:11:41Z","title":"Pythae: Unifying Generative Autoencoders in Python -- A Benchmarking Use\n Case","summary":" In recent years, deep generative models have attracted increasing interest\ndue to their capacity to model complex distributions. Among those models,\nvariational autoencoders have gained popularity as they have proven both to be\ncomputationally efficient and yield impressive results in multiple fields.\nFollowing this breakthrough, extensive research has been done in order to\nimprove the original publication, resulting in a variety of different VAE\nmodels in response to different tasks. In this paper we present Pythae, a\nversatile open-source Python library providing both a unified implementation\nand a dedicated framework allowing straightforward, reproducible and reliable\nuse of generative autoencoder models. We then propose to use this library to\nperform a case study benchmark where we present and compare 19 generative\nautoencoder models representative of some of the main improvements on\ndownstream tasks such as image reconstruction, generation, classification,\nclustering and interpolation. 
The open-source library can be found at\nhttps://github.com/clementchadebec/benchmark_VAE.\n","authors":["Clément Chadebec","Louis J. Vincent","Stéphanie Allassonnière"],"pdf_url":"https://arxiv.org/pdf/2206.08309v2.pdf","comment":"Accepted to NeurIPS 2022"},{"id":"http://arxiv.org/abs/2210.16299v3","updated":"2023-07-20T05:27:03Z","published":"2022-10-28T17:52:18Z","title":"Nonuniqueness and Convergence to Equivalent Solutions in Observer-based\n Inverse Reinforcement Learning","summary":" A key challenge in solving the deterministic inverse reinforcement learning\n(IRL) problem online and in real-time is the existence of multiple solutions.\nNonuniqueness necessitates the study of the notion of equivalent solutions,\ni.e., solutions that result in a different cost functional but same feedback\nmatrix, and convergence to such solutions. While offline algorithms that result\nin convergence to equivalent solutions have been developed in the literature,\nonline, real-time techniques that address nonuniqueness are not available. In\nthis paper, a regularized history stack observer that converges to\napproximately equivalent solutions of the IRL problem is developed. Novel\ndata-richness conditions are developed to facilitate the analysis and\nsimulation results are provided to demonstrate the effectiveness of the\ndeveloped technique.\n","authors":["Jared Town","Zachary Morrison","Rushikesh Kamalapurkar"],"pdf_url":"https://arxiv.org/pdf/2210.16299v3.pdf","comment":"16 pages, 7 figures, submitted to American Controls Conference 2023"},{"id":"http://arxiv.org/abs/2307.10596v1","updated":"2023-07-20T05:23:49Z","published":"2023-07-20T05:23:49Z","title":"Ensemble Learning based Anomaly Detection for IoT Cybersecurity via\n Bayesian Hyperparameters Sensitivity Analysis","summary":" The Internet of Things (IoT) integrates more than billions of intelligent\ndevices over the globe with the capability of communicating with other\nconnected devices with little to no human intervention. IoT enables data\naggregation and analysis on a large scale to improve life quality in many\ndomains. In particular, data collected by IoT contain a tremendous amount of\ninformation for anomaly detection. The heterogeneous nature of IoT is both a\nchallenge and an opportunity for cybersecurity. Traditional approaches in\ncybersecurity monitoring often require different kinds of data pre-processing\nand handling for various data types, which might be problematic for datasets\nthat contain heterogeneous features. However, heterogeneous types of network\ndevices can often capture a more diverse set of signals than a single type of\ndevice readings, which is particularly useful for anomaly detection. In this\npaper, we present a comprehensive study on using ensemble machine learning\nmethods for enhancing IoT cybersecurity via anomaly detection. Rather than\nusing one single machine learning model, ensemble learning combines the\npredictive power from multiple models, enhancing their predictive accuracy in\nheterogeneous datasets rather than using one single machine learning model. We\npropose a unified framework with ensemble learning that utilises Bayesian\nhyperparameter optimisation to adapt to a network environment that contains\nmultiple IoT sensor readings. 
Experimentally, we illustrate their high\npredictive power when compared to traditional methods.\n","authors":["Tin Lai","Farnaz Farid","Abubakar Bello","Fariza Sabrina"],"pdf_url":"https://arxiv.org/pdf/2307.10596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10588v1","updated":"2023-07-20T05:03:25Z","published":"2023-07-20T05:03:25Z","title":"Forecasting Battery Electric Vehicle Charging Behavior: A Deep Learning\n Approach Equipped with Micro-Clustering and SMOTE Techniques","summary":" Energy systems, climate change, and public health are among the primary\nreasons for moving toward electrification in transportation. Transportation\nelectrification is being promoted worldwide to reduce emissions. As a result,\nmany automakers will soon start making only battery electric vehicles (BEVs).\nBEV adoption rates are rising in California, mainly due to climate change and\nair pollution concerns. While great for climate and pollution goals, improperly\nmanaged BEV charging can lead to insufficient charging infrastructure and power\noutages. This study develops a novel Micro Clustering Deep Neural Network\n(MCDNN), an artificial neural network algorithm that is highly effective at\nlearning BEVs trip and charging data to forecast BEV charging events,\ninformation that is essential for electricity load aggregators and utility\nmanagers to provide charging stations and electricity capacity effectively. The\nMCDNN is configured using a robust dataset of trips and charges that occurred\nin California between 2015 and 2020 from 132 BEVs, spanning 5 BEV models for a\ntotal of 1570167 vehicle miles traveled. The numerical findings revealed that\nthe proposed MCDNN is more effective than benchmark approaches in this field,\nsuch as support vector machine, k nearest neighbors, decision tree, and other\nneural network-based models in predicting the charging events.\n","authors":["Hanif Tayarani","Trisha V. Ramadoss","Vaishnavi Karanam","Gil Tal","Christopher Nitta"],"pdf_url":"https://arxiv.org/pdf/2307.10588v1.pdf","comment":"18 pages,8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.10586v1","updated":"2023-07-20T05:00:13Z","published":"2023-07-20T05:00:13Z","title":"A Holistic Assessment of the Reliability of Machine Learning Systems","summary":" As machine learning (ML) systems increasingly permeate high-stakes settings\nsuch as healthcare, transportation, military, and national security, concerns\nregarding their reliability have emerged. Despite notable progress, the\nperformance of these systems can significantly diminish due to adversarial\nattacks or environmental changes, leading to overconfident predictions,\nfailures to detect input faults, and an inability to generalize in unexpected\nscenarios. This paper proposes a holistic assessment methodology for the\nreliability of ML systems. Our framework evaluates five key properties:\nin-distribution accuracy, distribution-shift robustness, adversarial\nrobustness, calibration, and out-of-distribution detection. A reliability score\nis also introduced and used to assess the overall system reliability. To\nprovide insights into the performance of different algorithmic approaches, we\nidentify and categorize state-of-the-art techniques, then evaluate a selection\non real-world tasks using our proposed reliability metrics and reliability\nscore. 
Our analysis of over 500 models reveals that designing for one metric\ndoes not necessarily constrain others but certain algorithmic techniques can\nimprove reliability across multiple metrics simultaneously. This study\ncontributes to a more comprehensive understanding of ML reliability and\nprovides a roadmap for future research and development.\n","authors":["Anthony Corso","David Karamadian","Romeo Valentin","Mary Cooper","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2307.10586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10580v1","updated":"2023-07-20T04:46:34Z","published":"2023-07-20T04:46:34Z","title":"Intelligent model for offshore China sea fog forecasting","summary":" Accurate and timely prediction of sea fog is very important for effectively\nmanaging maritime and coastal economic activities. Given the intricate nature\nand inherent variability of sea fog, traditional numerical and statistical\nforecasting methods are often proven inadequate. This study aims to develop an\nadvanced sea fog forecasting method embedded in a numerical weather prediction\nmodel using the Yangtze River Estuary (YRE) coastal area as a case study. Prior\nto training our machine learning model, we employ a time-lagged correlation\nanalysis technique to identify key predictors and decipher the underlying\nmechanisms driving sea fog occurrence. In addition, we implement ensemble\nlearning and a focal loss function to address the issue of imbalanced data,\nthereby enhancing the predictive ability of our model. To verify the accuracy\nof our method, we evaluate its performance using a comprehensive dataset\nspanning one year, which encompasses both weather station observations and\nhistorical forecasts. Remarkably, our machine learning-based approach surpasses\nthe predictive performance of two conventional methods, the weather research\nand forecasting nonhydrostatic mesoscale model (WRF-NMM) and the algorithm\ndeveloped by the National Oceanic and Atmospheric Administration (NOAA)\nForecast Systems Laboratory (FSL). Specifically, in regard to predicting sea\nfog with a visibility of less than or equal to 1 km with a lead time of 60\nhours, our methodology achieves superior results by increasing the probability\nof detection (POD) while simultaneously reducing the false alarm ratio (FAR).\n","authors":["Yanfei Xiang","Qinghong Zhang","Mingqing Wang","Ruixue Xia","Yang Kong","Xiaomeng Huang"],"pdf_url":"https://arxiv.org/pdf/2307.10580v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10579v1","updated":"2023-07-20T04:45:59Z","published":"2023-07-20T04:45:59Z","title":"SecureBoost Hyperparameter Tuning via Multi-Objective Federated Learning","summary":" SecureBoost is a tree-boosting algorithm leveraging homomorphic encryption to\nprotect data privacy in vertical federated learning setting. It is widely used\nin fields such as finance and healthcare due to its interpretability,\neffectiveness, and privacy-preserving capability. However, SecureBoost suffers\nfrom high computational complexity and risk of label leakage. To harness the\nfull potential of SecureBoost, hyperparameters of SecureBoost should be\ncarefully chosen to strike an optimal balance between utility, efficiency, and\nprivacy. Existing methods either set hyperparameters empirically or\nheuristically, which are far from optimal. 
To fill this gap, we propose a\nConstrained Multi-Objective SecureBoost (CMOSB) algorithm to find Pareto\noptimal solutions that each solution is a set of hyperparameters achieving\noptimal tradeoff between utility loss, training cost, and privacy leakage. We\ndesign measurements of the three objectives. In particular, the privacy leakage\nis measured using our proposed instance clustering attack. Experimental results\ndemonstrate that the CMOSB yields not only hyperparameters superior to the\nbaseline but also optimal sets of hyperparameters that can support the flexible\nrequirements of FL participants.\n","authors":["Ziyao Ren","Yan Kang","Lixin Fan","Linghua Yang","Tao Fan","Yongxin Tong","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10575v1","updated":"2023-07-20T04:35:50Z","published":"2023-07-20T04:35:50Z","title":"Boosting Federated Learning Convergence with Prototype Regularization","summary":" As a distributed machine learning technique, federated learning (FL) requires\nclients to collaboratively train a shared model with an edge server without\nleaking their local data. However, the heterogeneous data distribution among\nclients often leads to a decrease in model performance. To tackle this issue,\nthis paper introduces a prototype-based regularization strategy to address the\nheterogeneity in the data distribution. Specifically, the regularization\nprocess involves the server aggregating local prototypes from distributed\nclients to generate a global prototype, which is then sent back to the\nindividual clients to guide their local training. The experimental results on\nMNIST and Fashion-MNIST show that our proposal achieves improvements of 3.3%\nand 8.9% in average test accuracy, respectively, compared to the most popular\nbaseline FedAvg. Furthermore, our approach has a fast convergence rate in\nheterogeneous settings.\n","authors":["Yu Qiao","Huy Q. Le","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2307.10575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10569v1","updated":"2023-07-20T04:14:09Z","published":"2023-07-20T04:14:09Z","title":"Deceptive Alignment Monitoring","summary":" As the capabilities of large machine learning models continue to grow, and as\nthe autonomy afforded to such models continues to expand, the spectre of a new\nadversary looms: the models themselves. The threat that a model might behave in\na seemingly reasonable manner, while secretly and subtly modifying its behavior\nfor ulterior reasons is often referred to as deceptive alignment in the AI\nSafety & Alignment communities. Consequently, we call this new direction\nDeceptive Alignment Monitoring. In this work, we identify emerging directions\nin diverse machine learning subfields that we believe will become increasingly\nimportant and intertwined in the near future for deceptive alignment\nmonitoring, and we argue that advances in these fields present both long-term\nchallenges and new research opportunities. 
We conclude by advocating for\ngreater involvement by the adversarial machine learning community in these\nemerging directions.\n","authors":["Andres Carranza","Dhruv Pai","Rylan Schaeffer","Arnuv Tandon","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2307.10569v1.pdf","comment":"Accepted as BlueSky Oral to 2023 ICML AdvML Workshop"},{"id":"http://arxiv.org/abs/2307.10563v1","updated":"2023-07-20T04:00:37Z","published":"2023-07-20T04:00:37Z","title":"FACADE: A Framework for Adversarial Circuit Anomaly Detection and\n Evaluation","summary":" We present FACADE, a novel probabilistic and geometric framework designed for\nunsupervised mechanistic anomaly detection in deep neural networks. Its primary\ngoal is advancing the understanding and mitigation of adversarial attacks.\nFACADE aims to generate probabilistic distributions over circuits, which\nprovide critical insights to their contribution to changes in the manifold\nproperties of pseudo-classes, or high-dimensional modes in activation space,\nyielding a powerful tool for uncovering and combating adversarial attacks. Our\napproach seeks to improve model robustness, enhance scalable model oversight,\nand demonstrates promising applications in real-world deployment settings.\n","authors":["Dhruv Pai","Andres Carranza","Rylan Schaeffer","Arnuv Tandon","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2307.10563v1.pdf","comment":"Accepted as BlueSky Poster at 2023 ICML AdvML Workshop"},{"id":"http://arxiv.org/abs/2307.10562v1","updated":"2023-07-20T03:56:04Z","published":"2023-07-20T03:56:04Z","title":"Shared Adversarial Unlearning: Backdoor Mitigation by Unlearning Shared\n Adversarial Examples","summary":" Backdoor attacks are serious security threats to machine learning models\nwhere an adversary can inject poisoned samples into the training set, causing a\nbackdoored model which predicts poisoned samples with particular triggers to\nparticular target classes, while behaving normally on benign samples. In this\npaper, we explore the task of purifying a backdoored model using a small clean\ndataset. By establishing the connection between backdoor risk and adversarial\nrisk, we derive a novel upper bound for backdoor risk, which mainly captures\nthe risk on the shared adversarial examples (SAEs) between the backdoored model\nand the purified model. This upper bound further suggests a novel bi-level\noptimization problem for mitigating backdoor using adversarial training\ntechniques. To solve it, we propose Shared Adversarial Unlearning (SAU).\nSpecifically, SAU first generates SAEs, and then, unlearns the generated SAEs\nsuch that they are either correctly classified by the purified model and/or\ndifferently classified by the two models, such that the backdoor effect in the\nbackdoored model will be mitigated in the purified model. Experiments on\nvarious benchmark datasets and network architectures show that our proposed\nmethod achieves state-of-the-art performance for backdoor defense.\n","authors":["Shaokui Wei","Mingda Zhang","Hongyuan Zha","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10560v1","updated":"2023-07-20T03:55:53Z","published":"2023-07-20T03:55:53Z","title":"Post-variational quantum neural networks","summary":" Quantum computing has the potential to provide substantial computational\nadvantages over current state-of-the-art classical supercomputers. However,\ncurrent hardware is not advanced enough to execute fault-tolerant quantum\nalgorithms. 
An alternative of using hybrid quantum-classical computing with\nvariational algorithms can exhibit barren plateau issues, causing slow\nconvergence of gradient-based optimization techniques. In this paper, we\ndiscuss \"post-variational strategies\", which shift tunable parameters from the\nquantum computer to the classical computer, opting for ensemble strategies when\noptimizing quantum models. We discuss various strategies and design principles\nfor constructing individual quantum circuits, where the resulting ensembles can\nbe optimized with convex programming. Further, we discuss architectural designs\nof post-variational quantum neural networks and analyze the propagation of\nestimation errors throughout such neural networks. Lastly, we show that our\nalgorithm can be applied to real-world applications such as image\nclassification on handwritten digits, producing a 96% classification accuracy.\n","authors":["Po-Wei Huang","Patrick Rebentrost"],"pdf_url":"https://arxiv.org/pdf/2307.10560v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10559v1","updated":"2023-07-20T03:54:47Z","published":"2023-07-20T03:54:47Z","title":"Air Traffic Controller Workload Level Prediction using Conformalized\n Dynamical Graph Learning","summary":" Air traffic control (ATC) is a safety-critical service system that demands\nconstant attention from ground air traffic controllers (ATCos) to maintain\ndaily aviation operations. The workload of the ATCos can have negative effects\non operational safety and airspace usage. To avoid overloading and ensure an\nacceptable workload level for the ATCos, it is important to predict the ATCos'\nworkload accurately for mitigation actions. In this paper, we first perform a\nreview of research on ATCo workload, mostly from the air traffic perspective.\nThen, we briefly introduce the setup of the human-in-the-loop (HITL)\nsimulations with retired ATCos, where the air traffic data and workload labels\nare obtained. The simulations are conducted under three Phoenix approach\nscenarios while the human ATCos are requested to self-evaluate their workload\nratings (i.e., low-1 to high-7). Preliminary data analysis is conducted. Next,\nwe propose a graph-based deep-learning framework with conformal prediction to\nidentify the ATCo workload levels. The number of aircraft under the\ncontroller's control varies both spatially and temporally, resulting in\ndynamically evolving graphs. The experiment results suggest that (a) besides\nthe traffic density feature, the traffic conflict feature contributes to the\nworkload prediction capabilities (i.e., minimum horizontal/vertical separation\ndistance); (b) directly learning from the spatiotemporal graph layout of\nairspace with graph neural network can achieve higher prediction accuracy,\ncompare to hand-crafted traffic complexity features; (c) conformal prediction\nis a valuable tool to further boost model prediction accuracy, resulting a\nrange of predicted workload labels. The code used is available at\n\\href{https://github.com/ymlasu/para-atm-collection/blob/master/air-traffic-prediction/ATC-Workload-Prediction/}{$\\mathsf{Link}$}.\n","authors":["Yutian Pang","Jueming Hu","Christopher S. Lieber","Nancy J. 
Cooke","Yongming Liu"],"pdf_url":"https://arxiv.org/pdf/2307.10559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10550v1","updated":"2023-07-20T03:28:06Z","published":"2023-07-20T03:28:06Z","title":"SC VALL-E: Style-Controllable Zero-Shot Text to Speech Synthesizer","summary":" Expressive speech synthesis models are trained by adding corpora with diverse\nspeakers, various emotions, and different speaking styles to the dataset, in\norder to control various characteristics of speech and generate the desired\nvoice. In this paper, we propose a style control (SC) VALL-E model based on the\nneural codec language model (called VALL-E), which follows the structure of the\ngenerative pretrained transformer 3 (GPT-3). The proposed SC VALL-E takes input\nfrom text sentences and prompt audio and is designed to generate controllable\nspeech by not simply mimicking the characteristics of the prompt audio but by\ncontrolling the attributes to produce diverse voices. We identify tokens in the\nstyle embedding matrix of the newly designed style network that represent\nattributes such as emotion, speaking rate, pitch, and voice intensity, and\ndesign a model that can control these attributes. To evaluate the performance\nof SC VALL-E, we conduct comparative experiments with three representative\nexpressive speech synthesis models: global style token (GST) Tacotron2,\nvariational autoencoder (VAE) Tacotron2, and original VALL-E. We measure word\nerror rate (WER), F0 voiced error (FVE), and F0 gross pitch error (F0GPE) as\nevaluation metrics to assess the accuracy of generated sentences. For comparing\nthe quality of synthesized speech, we measure comparative mean option score\n(CMOS) and similarity mean option score (SMOS). To evaluate the style control\nability of the generated speech, we observe the changes in F0 and\nmel-spectrogram by modifying the trained tokens. When using prompt audio that\nis not present in the training data, SC VALL-E generates a variety of\nexpressive sounds and demonstrates competitive performance compared to the\nexisting models. Our implementation, pretrained models, and audio samples are\nlocated on GitHub.\n","authors":["Daegyeom Kim","Seongho Hong","Yong-Hoon Choi"],"pdf_url":"https://arxiv.org/pdf/2307.10550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08122v2","updated":"2023-07-20T03:07:28Z","published":"2023-07-16T18:31:25Z","title":"Tangent Transformers for Composition, Privacy and Removal","summary":" We introduce Tangent Attention Fine-Tuning (TAFT), a method for fine-tuning\nlinearized transformers obtained by computing a First-order Taylor Expansion\naround a pre-trained initialization. We show that the Jacobian-Vector Product\nresulting from linearization can be computed efficiently in a single forward\npass, reducing training and inference cost to the same order of magnitude as\nits original non-linear counterpart, while using the same number of parameters.\nFurthermore, we show that, when applied to various downstream visual\nclassification tasks, the resulting Tangent Transformer fine-tuned with TAFT\ncan perform comparably with fine-tuning the original non-linear network. 
Since\nTangent Transformers are linear with respect to the new set of weights, and the\nresulting fine-tuning loss is convex, we show that TAFT enjoys several\nadvantages compared to non-linear fine-tuning when it comes to model\ncomposition, parallel training, machine unlearning, and differential privacy.\n","authors":["Tian Yu Liu","Aditya Golatkar","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2307.08122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03718v4","updated":"2023-07-20T03:06:50Z","published":"2023-06-06T14:28:57Z","title":"Emotion-Conditioned Melody Harmonization with Hierarchical Variational\n Autoencoder","summary":" Existing melody harmonization models have made great progress in improving\nthe quality of generated harmonies, but most of them ignored the emotions\nbeneath the music. Meanwhile, the variability of harmonies generated by\nprevious methods is insufficient. To solve these problems, we propose a novel\nLSTM-based Hierarchical Variational Auto-Encoder (LHVAE) to investigate the\ninfluence of emotional conditions on melody harmonization, while improving the\nquality of generated harmonies and capturing the abundant variability of chord\nprogressions. Specifically, LHVAE incorporates latent variables and emotional\nconditions at different levels (piece- and bar-level) to model the global and\nlocal music properties. Additionally, we introduce an attention-based melody\ncontext vector at each step to better learn the correspondence between melodies\nand harmonies. Objective experimental results show that our proposed model\noutperforms other LSTM-based models. Through subjective evaluation, we conclude\nthat only altering the types of chords hardly changes the overall emotion of\nthe music. The qualitative analysis demonstrates the ability of our model to\ngenerate variable harmonies.\n","authors":["Shulei Ji","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2306.03718v4.pdf","comment":"Accepted by IEEE SMC 2023"},{"id":"http://arxiv.org/abs/2212.12658v2","updated":"2023-07-20T03:00:05Z","published":"2022-12-24T05:25:09Z","title":"Improving Uncertainty Quantification of Variance Networks by\n Tree-Structured Learning","summary":" To improve the uncertainty quantification of variance networks, we propose a\nnovel tree-structured local neural network model that partitions the feature\nspace into multiple regions based on uncertainty heterogeneity. A tree is built\nupon giving the training data, whose leaf nodes represent different regions\nwhere region-specific neural networks are trained to predict both the mean and\nthe variance for quantifying uncertainty. The proposed Uncertainty-Splitting\nNeural Regression Tree (USNRT) employs novel splitting criteria. At each node,\na neural network is trained on the full data first, and a statistical test for\nthe residuals is conducted to find the best split, corresponding to the two\nsub-regions with the most significant uncertainty heterogeneity between them.\nUSNRT is computationally friendly because very few leaf nodes are sufficient\nand pruning is unnecessary. Furthermore, an ensemble version can be easily\nconstructed to estimate the total uncertainty including the aleatory and\nepistemic. On extensive UCI datasets, USNRT or its ensemble shows superior\nperformance compared to some recent popular methods for quantifying uncertainty\nwith variances. 
Through comprehensive visualization and analysis, we uncover\nhow USNRT works and show its merits, revealing that uncertainty heterogeneity\ndoes exist in many datasets and can be learned by USNRT.\n","authors":["Wenxuan Ma","Xing Yan","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.12658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09767v2","updated":"2023-07-20T02:51:15Z","published":"2023-03-17T04:18:03Z","title":"It Is All About Data: A Survey on the Effects of Data on Adversarial\n Robustness","summary":" Adversarial examples are inputs to machine learning models that an attacker\nhas intentionally designed to confuse the model into making a mistake. Such\nexamples pose a serious threat to the applicability of machine-learning-based\nsystems, especially in life- and safety-critical domains. To address this\nproblem, the area of adversarial robustness investigates mechanisms behind\nadversarial attacks and defenses against these attacks. This survey reviews a\nparticular subset of this literature that focuses on investigating properties\nof training data in the context of model robustness under evasion attacks. It\nfirst summarizes the main properties of data leading to adversarial\nvulnerability. It then discusses guidelines and techniques for improving\nadversarial robustness by enhancing the data representation and learning\nprocedures, as well as techniques for estimating robustness guarantees given\nparticular data. Finally, it discusses gaps of knowledge and promising future\nresearch directions in this area.\n","authors":["Peiyu Xiong","Michael Tegegn","Jaskeerat Singh Sarin","Shubhraneel Pal","Julia Rubin"],"pdf_url":"https://arxiv.org/pdf/2303.09767v2.pdf","comment":"51 pages, 25 figures, under review"},{"id":"http://arxiv.org/abs/2304.10159v2","updated":"2023-07-20T02:49:49Z","published":"2023-04-20T08:32:58Z","title":"Deep-Q Learning with Hybrid Quantum Neural Network on Solving Maze\n Problems","summary":" Quantum computing holds great potential for advancing the limitations of\nmachine learning algorithms to handle higher data dimensions and reduce overall\ntraining parameters in deep neural network (DNN) models. This study uses a\nparameterized quantum circuit (PQC) on a gate-based quantum computer to\ninvestigate the potential for quantum advantage in a model-free reinforcement\nlearning problem. Through a comprehensive investigation and evaluation of the\ncurrent model and capabilities of quantum computers, we designed and trained a\nnovel hybrid Quantum neural network based on the latest Qiskit and PyTorch\nframework. We compared its performance with a full-classical DNN with and\nwithout an integrated PQC. Our research provides insights into the potential of\ndeep quantum learning to solve a maze problem and, potentially, other\nreinforcement learning problems. We conclude that various reinforcement\nlearning problems can be effective with reasonable training epochs. 
Moreover, a\ncomparative discussion of the various quantum reinforcement learning model on\nmaze problems is discussed to evaluate our research's overall potential and\nadvantages.\n","authors":["Hao-Yuan Chen","Yen-Jui Chang","Ching-Ray Chang"],"pdf_url":"https://arxiv.org/pdf/2304.10159v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10541v1","updated":"2023-07-20T02:42:23Z","published":"2023-07-20T02:42:23Z","title":"Differentially Flat Learning-based Model Predictive Control Using a\n Stability, State, and Input Constraining Safety Filter","summary":" Learning-based optimal control algorithms control unknown systems using past\ntrajectory data and a learned model of the system dynamics. These controllers\nuse either a linear approximation of the learned dynamics, trading performance\nfor faster computation, or nonlinear optimization methods, which typically\nperform better but can limit real-time applicability. In this work, we present\na novel nonlinear controller that exploits differential flatness to achieve\nsimilar performance to state-of-the-art learning-based controllers but with\nsignificantly less computational effort. Differential flatness is a property of\ndynamical systems whereby nonlinear systems can be exactly linearized through a\nnonlinear input mapping. Here, the nonlinear transformation is learned as a\nGaussian process and is used in a safety filter that guarantees, with high\nprobability, stability as well as input and flat state constraint satisfaction.\nThis safety filter is then used to refine inputs from a flat model predictive\ncontroller to perform constrained nonlinear learning-based optimal control\nthrough two successive convex optimizations. We compare our method to\nstate-of-the-art learning-based control strategies and achieve similar\nperformance, but with significantly better computational efficiency, while also\nrespecting flat state and input constraints, and guaranteeing stability.\n","authors":["Adam W. Hall","Melissa Greeff","Angela P. Schoellig"],"pdf_url":"https://arxiv.org/pdf/2307.10541v1.pdf","comment":"6 pages, 5 figures, Published in IEEE Control Systems Letters"},{"id":"http://arxiv.org/abs/2307.10529v1","updated":"2023-07-20T02:07:20Z","published":"2023-07-20T02:07:20Z","title":"Fast Unsupervised Deep Outlier Model Selection with Hypernetworks","summary":" Outlier detection (OD) finds many applications with a rich literature of\nnumerous techniques. Deep neural network based OD (DOD) has seen a recent surge\nof attention thanks to the many advances in deep learning. In this paper, we\nconsider a critical-yet-understudied challenge with unsupervised DOD, that is,\neffective hyperparameter (HP) tuning/model selection. While several prior work\nreport the sensitivity of OD models to HPs, it becomes ever so critical for the\nmodern DOD models that exhibit a long list of HPs. We introduce HYPER for\ntuning DOD models, tackling two fundamental challenges: (1) validation without\nsupervision (due to lack of labeled anomalies), and (2) efficient search of the\nHP/model space (due to exponential growth in the number of HPs). A key idea is\nto design and train a novel hypernetwork (HN) that maps HPs onto optimal\nweights of the main DOD model. In turn, HYPER capitalizes on a single HN that\ncan dynamically generate weights for many DOD models (corresponding to varying\nHPs), which offers significant speed-up. 
In addition, it employs meta-learning\non historical OD tasks with labels to train a proxy validation function,\nlikewise trained with our proposed HN efficiently. Extensive experiments on 35\nOD tasks show that HYPER achieves high performance against 8 baselines with\nsignificant efficiency gains.\n","authors":["Xueying Ding","Yue Zhao","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2307.10529v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.10524v1","updated":"2023-07-20T01:56:10Z","published":"2023-07-20T01:56:10Z","title":"Beyond Black-Box Advice: Learning-Augmented Algorithms for MDPs with\n Q-Value Predictions","summary":" We study the tradeoff between consistency and robustness in the context of a\nsingle-trajectory time-varying Markov Decision Process (MDP) with untrusted\nmachine-learned advice. Our work departs from the typical approach of treating\nadvice as coming from black-box sources by instead considering a setting where\nadditional information about how the advice is generated is available. We prove\na first-of-its-kind consistency and robustness tradeoff given Q-value advice\nunder a general MDP model that includes both continuous and discrete\nstate/action spaces. Our results highlight that utilizing Q-value advice\nenables dynamic pursuit of the better of machine-learned advice and a robust\nbaseline, thus result in near-optimal performance guarantees, which provably\nimproves what can be obtained solely with black-box advice.\n","authors":["Tongxin Li","Yiheng Lin","Shaolei Ren","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2307.10524v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2302.10980v3","updated":"2023-07-20T01:34:16Z","published":"2023-02-21T20:26:39Z","title":"MultiRobustBench: Benchmarking Robustness Against Multiple Attacks","summary":" The bulk of existing research in defending against adversarial examples\nfocuses on defending against a single (typically bounded Lp-norm) attack, but\nfor a practical setting, machine learning (ML) models should be robust to a\nwide variety of attacks. In this paper, we present the first unified framework\nfor considering multiple attacks against ML models. Our framework is able to\nmodel different levels of learner's knowledge about the test-time adversary,\nallowing us to model robustness against unforeseen attacks and robustness\nagainst unions of attacks. Using our framework, we present the first\nleaderboard, MultiRobustBench, for benchmarking multiattack evaluation which\ncaptures performance across attack types and attack strengths. We evaluate the\nperformance of 16 defended models for robustness against a set of 9 different\nattack types, including Lp-based threat models, spatial transformations, and\ncolor changes, at 20 different attack strengths (180 attacks total).\nAdditionally, we analyze the state of current defenses against multiple\nattacks. 
Our analysis shows that while existing defenses have made progress in\nterms of average robustness across the set of attacks used, robustness against\nthe worst-case attack is still a big open problem as all existing models\nperform worse than random guessing.\n","authors":["Sihui Dai","Saeed Mahloujifar","Chong Xiang","Vikash Sehwag","Pin-Yu Chen","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2302.10980v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2305.11408v2","updated":"2023-07-20T00:58:30Z","published":"2023-05-19T03:31:42Z","title":"AlignAtt: Using Attention-based Audio-Translation Alignments as a Guide\n for Simultaneous Speech Translation","summary":" Attention is the core mechanism of today's most used architectures for\nnatural language processing and has been analyzed from many perspectives,\nincluding its effectiveness for machine translation-related tasks. Among these\nstudies, attention resulted to be a useful source of information to get\ninsights about word alignment also when the input text is substituted with\naudio segments, as in the case of the speech translation (ST) task. In this\npaper, we propose AlignAtt, a novel policy for simultaneous ST (SimulST) that\nexploits the attention information to generate source-target alignments that\nguide the model during inference. Through experiments on the 8 language pairs\nof MuST-C v1.0, we show that AlignAtt outperforms previous state-of-the-art\nSimulST policies applied to offline-trained models with gains in terms of BLEU\nof 2 points and latency reductions ranging from 0.5s to 0.8s across the 8\nlanguages.\n","authors":["Sara Papi","Marco Turchi","Matteo Negri"],"pdf_url":"https://arxiv.org/pdf/2305.11408v2.pdf","comment":"Accepted at Interspeech 2023"},{"id":"http://arxiv.org/abs/2307.04603v4","updated":"2023-07-20T00:49:13Z","published":"2023-07-07T09:01:42Z","title":"Solvent: A Framework for Protein Folding","summary":" Consistency and reliability are crucial for conducting AI research. Many\nfamous research fields, such as object detection, have been compared and\nvalidated with solid benchmark frameworks. After AlphaFold2, the protein\nfolding task has entered a new phase, and many methods are proposed based on\nthe component of AlphaFold2. The importance of a unified research framework in\nprotein folding contains implementations and benchmarks to consistently and\nfairly compare various approaches. To achieve this, we present Solvent, an\nprotein folding framework that supports significant components of\nstate-of-the-art models in the manner of off-the-shelf interface Solvent\ncontains different models implemented in a unified codebase and supports\ntraining and evaluation for defined models on the same dataset. We benchmark\nwell-known algorithms and their components and provide experiments that give\nhelpful insights into the protein structure modeling field. We hope that\nSolvent will increase the reliability and consistency of proposed models and\ngives efficiency in both speed and costs, resulting in acceleration on protein\nfolding modeling research. 
The code is available at\nhttps://github.com/kakaobrain/solvent, and the project will continue to be\ndeveloped.\n","authors":["Jaemyung Lee","Kyeongtak Han","Jaehoon Kim","Hasun Yu","Youhan Lee"],"pdf_url":"https://arxiv.org/pdf/2307.04603v4.pdf","comment":"preprint, 8pages"},{"id":"http://arxiv.org/abs/2307.09702v2","updated":"2023-07-20T00:40:41Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for Large Language Models","summary":" In this article we describe an efficient approach to guiding language model\ntext generation with regular expressions and context-free grammars. Our\napproach adds little to no overhead to the token sequence generation process,\nand makes guided generation feasible in practice. An implementation is provided\nin the open source Python library Outlines.\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10507v1","updated":"2023-07-20T00:07:29Z","published":"2023-07-20T00:07:29Z","title":"FedSoup: Improving Generalization and Personalization in Federated\n Learning via Selective Model Interpolation","summary":" Cross-silo federated learning (FL) enables the development of machine\nlearning models on datasets distributed across data centers such as hospitals\nand clinical research laboratories. However, recent research has found that\ncurrent FL algorithms face a trade-off between local and global performance\nwhen confronted with distribution shifts. Specifically, personalized FL methods\nhave a tendency to overfit to local data, leading to a sharp valley in the\nlocal model and inhibiting its ability to generalize to out-of-distribution\ndata. In this paper, we propose a novel federated model soup method (i.e.,\nselective interpolation of model parameters) to optimize the trade-off between\nlocal and global performance. Specifically, during the federated training\nphase, each client maintains its own global model pool by monitoring the\nperformance of the interpolated model between the local and global models. This\nallows us to alleviate overfitting and seek flat minima, which can\nsignificantly improve the model's generalization performance. We evaluate our\nmethod on retinal and pathological image classification tasks, and our proposed\nmethod achieves significant improvements for out-of-distribution\ngeneralization. Our code is available at https://github.com/ubc-tea/FedSoup.\n","authors":["Minghui Chen","Meirui Jiang","Qi Dou","Zehua Wang","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2307.10507v1.pdf","comment":"Accepted by MICCAI2023"},{"id":"http://arxiv.org/abs/2307.10504v1","updated":"2023-07-20T00:02:24Z","published":"2023-07-20T00:02:24Z","title":"Identifying Interpretable Subspaces in Image Representations","summary":" We propose Automatic Feature Explanation using Contrasting Concepts (FALCON),\nan interpretability framework to explain features of image representations. For\na target feature, FALCON captions its highly activating cropped images using a\nlarge captioning dataset (like LAION-400m) and a pre-trained vision-language\nmodel like CLIP. Each word among the captions is scored and ranked leading to a\nsmall number of shared, human-understandable concepts that closely describe the\ntarget feature. FALCON also applies contrastive interpretation using lowly\nactivating (counterfactual) images, to eliminate spurious concepts. 
Although\nmany existing approaches interpret features independently, we observe in\nstate-of-the-art self-supervised and supervised models, that less than 20% of\nthe representation space can be explained by individual features. We show that\nfeatures in larger spaces become more interpretable when studied in groups and\ncan be explained with high-order scoring concepts through FALCON. We discuss\nhow extracted concepts can be used to explain and debug failures in downstream\ntasks. Finally, we present a technique to transfer concepts from one\n(explainable) representation space to another unseen representation space by\nlearning a simple linear transformation.\n","authors":["Neha Kalibhat","Shweta Bhardwaj","Bayan Bruss","Hamed Firooz","Maziar Sanjabi","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2307.10504v1.pdf","comment":"Published at ICML 2023"},{"id":"http://arxiv.org/abs/2307.11081v1","updated":"2023-07-20T17:57:04Z","published":"2023-07-20T17:57:04Z","title":"GLSFormer: Gated - Long, Short Sequence Transformer for Step Recognition\n in Surgical Videos","summary":" Automated surgical step recognition is an important task that can\nsignificantly improve patient safety and decision-making during surgeries.\nExisting state-of-the-art methods for surgical step recognition either rely on\nseparate, multi-stage modeling of spatial and temporal information or operate\non short-range temporal resolution when learned jointly. However, the benefits\nof joint modeling of spatio-temporal features and long-range information are\nnot taken in account. In this paper, we propose a vision transformer-based\napproach to jointly learn spatio-temporal features directly from sequence of\nframe-level patches. Our method incorporates a gated-temporal attention\nmechanism that intelligently combines short-term and long-term spatio-temporal\nfeature representations. We extensively evaluate our approach on two cataract\nsurgery video datasets, namely Cataract-101 and D99, and demonstrate superior\nperformance compared to various state-of-the-art methods. These results\nvalidate the suitability of our proposed approach for automated surgical step\nrecognition. Our code is released at:\nhttps://github.com/nisargshah1999/GLSFormer\n","authors":["Nisarg A. Shah","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2307.11081v1.pdf","comment":"Accepted to MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2307.11018v1","updated":"2023-07-20T16:45:22Z","published":"2023-07-20T16:45:22Z","title":"Amortized Variational Inference: When and Why?","summary":" Amortized variational inference (A-VI) is a method for approximating the\nintractable posterior distributions that arise in probabilistic models. The\ndefining feature of A-VI is that it learns a global inference function that\nmaps each observation to its local latent variable's approximate posterior.\nThis stands in contrast to the more classical factorized (or mean-field)\nvariational inference (F-VI), which directly learns the parameters of the\napproximating distribution for each latent variable. In deep generative models,\nA-VI is used as a computational trick to speed up inference for local latent\nvariables. In this paper, we study A-VI as a general alternative to F-VI for\napproximate posterior inference. A-VI cannot produce an approximation with a\nlower Kullback-Leibler divergence than F-VI's optimal solution, because the\namortized family is a subset of the factorized family. 
Thus a central\ntheoretical problem is to characterize when A-VI still attains F-VI's optimal\nsolution. We derive conditions on both the model and the inference function\nunder which A-VI can theoretically achieve F-VI's optimum. We show that for a\nbroad class of hierarchical models, including deep generative models, it is\npossible to close the gap between A-VI and F-VI. Further, for an even broader\nclass of models, we establish when and how to expand the domain of the\ninference function to make amortization a feasible strategy. Finally, we prove\nthat for certain models -- including hidden Markov models and Gaussian\nprocesses -- A-VI cannot match F-VI's solution, no matter how expressive the\ninference function is. We also study A-VI empirically. On several examples, we\ncorroborate our theoretical results and investigate the performance of A-VI\nwhen varying the complexity of the inference function. When the gap between\nA-VI and F-VI can be closed, we find that the required complexity of the\nfunction need not scale with the number of observations, and that A-VI often\nconverges faster than F-VI.\n","authors":["Charles C. Margossian","David M. Blei"],"pdf_url":"https://arxiv.org/pdf/2307.11018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18451v3","updated":"2023-07-20T23:59:38Z","published":"2023-05-29T04:02:10Z","title":"Shift-Robust Molecular Relational Learning with Causal Substructure","summary":" Recently, molecular relational learning, whose goal is to predict the\ninteraction behavior between molecular pairs, got a surge of interest in\nmolecular sciences due to its wide range of applications. In this work, we\npropose CMRL that is robust to the distributional shift in molecular relational\nlearning by detecting the core substructure that is causally related to\nchemical reactions. To do so, we first assume a causal relationship based on\nthe domain knowledge of molecular sciences and construct a structural causal\nmodel (SCM) that reveals the relationship between variables. Based on the SCM,\nwe introduce a novel conditional intervention framework whose intervention is\nconditioned on the paired molecule. With the conditional intervention\nframework, our model successfully learns from the causal substructure and\nalleviates the confounding effect of shortcut substructures that are spuriously\ncorrelated to chemical reactions. Extensive experiments on various tasks with\nreal-world and synthetic datasets demonstrate the superiority of CMRL over\nstate-of-the-art baseline models. Our code is available at\nhttps://github.com/Namkyeong/CMRL.\n","authors":["Namkyeong Lee","Kanghoon Yoon","Gyoung S. Na","Sein Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2305.18451v3.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2307.08167v2","updated":"2023-07-20T23:08:11Z","published":"2023-07-16T22:35:52Z","title":"Computing the gradients with respect to all parameters of a quantum\n neural network using a single circuit","summary":" When computing the gradients of a quantum neural network using the\nparameter-shift rule, the cost function needs to be calculated twice for the\ngradient with respect to a single adjustable parameter of the network. When the\ntotal number of parameters is high, the quantum circuit for the computation has\nto be adjusted and run for many times. Here we propose an approach to compute\nall the gradients using a single circuit only, with a much reduced circuit\ndepth and less classical registers. 
We also demonstrate experimentally, on both\nreal quantum hardware and simulator, that our approach has the advantages that\nthe circuit takes a significantly shorter time to compile than the conventional\napproach, resulting in a speedup on the total runtime.\n","authors":["Guang Ping He"],"pdf_url":"https://arxiv.org/pdf/2307.08167v2.pdf","comment":"Added a suggestion on improving real quantum computers"},{"id":"http://arxiv.org/abs/2307.11249v1","updated":"2023-07-20T21:49:38Z","published":"2023-07-20T21:49:38Z","title":"On the Fisher-Rao Gradient of the Evidence Lower Bound","summary":" This article studies the Fisher-Rao gradient, also referred to as the natural\ngradient, of the evidence lower bound, the ELBO, which plays a crucial role\nwithin the theory of the Variational Autonecoder, the Helmholtz Machine and the\nFree Energy Principle. The natural gradient of the ELBO is related to the\nnatural gradient of the Kullback-Leibler divergence from a target distribution,\nthe prime objective function of learning. Based on invariance properties of\ngradients within information geometry, conditions on the underlying model are\nprovided that ensure the equivalence of minimising the prime objective function\nand the maximisation of the ELBO.\n","authors":["Nihat Ay","Jesse van Oostrum"],"pdf_url":"https://arxiv.org/pdf/2307.11249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11242v1","updated":"2023-07-20T21:25:25Z","published":"2023-07-20T21:25:25Z","title":"On-Sensor Data Filtering using Neuromorphic Computing for High Energy\n Physics Experiments","summary":" This work describes the investigation of neuromorphic computing-based spiking\nneural network (SNN) models used to filter data from sensor electronics in high\nenergy physics experiments conducted at the High Luminosity Large Hadron\nCollider. We present our approach for developing a compact neuromorphic model\nthat filters out the sensor data based on the particle's transverse momentum\nwith the goal of reducing the amount of data being sent to the downstream\nelectronics. The incoming charge waveforms are converted to streams of\nbinary-valued events, which are then processed by the SNN. We present our\ninsights on the various system design choices - from data encoding to optimal\nhyperparameters of the training algorithm - for an accurate and compact SNN\noptimized for hardware deployment. Our results show that an SNN trained with an\nevolutionary algorithm and an optimized set of hyperparameters obtains a signal\nefficiency of about 91% with nearly half as many parameters as a deep neural\nnetwork.\n","authors":["Shruti R. Kulkarni","Aaron Young","Prasanna Date","Narasinga Rao Miniskar","Jeffrey S. Vetter","Farah Fahim","Benjamin Parpillon","Jennet Dickinson","Nhan Tran","Jieun Yoo","Corrinne Mills","Morris Swartz","Petar Maksimovic","Catherine D. Schuman","Alice Bean"],"pdf_url":"https://arxiv.org/pdf/2307.11242v1.pdf","comment":"Manuscript accepted at ICONS'23"},{"id":"http://arxiv.org/abs/2307.11239v1","updated":"2023-07-20T21:22:02Z","published":"2023-07-20T21:22:02Z","title":"Edgewise outliers of network indexed signals","summary":" We consider models for network indexed multivariate data involving a\ndependence between variables as well as across graph nodes.\n In the framework of these models, we focus on outliers detection and\nintroduce the concept of edgewise outliers. 
For this purpose, we first derive\nthe distribution of some sums of squares, in particular squared Mahalanobis\ndistances that can be used to fix detection rules and thresholds for outlier\ndetection. We then propose a robust version of the deterministic MCD algorithm\nthat we call edgewise MCD. An application on simulated data shows the interest\nof taking the dependence structure into account. We also illustrate the utility\nof the proposed method with a real data set.\n","authors":["Christopher Rieser","Anne Ruiz-Gazen","Christine Thomas-Agnan"],"pdf_url":"https://arxiv.org/pdf/2307.11239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11234v1","updated":"2023-07-20T21:10:54Z","published":"2023-07-20T21:10:54Z","title":"QDC: Quantum Diffusion Convolution Kernels on Graphs","summary":" Graph convolutional neural networks (GCNs) operate by aggregating messages\nover local neighborhoods given the prediction task under interest. Many GCNs\ncan be understood as a form of generalized diffusion of input features on the\ngraph, and significant work has been dedicated to improving predictive accuracy\nby altering the ways of message passing. In this work, we propose a new\nconvolution kernel that effectively rewires the graph according to the\noccupation correlations of the vertices by trading on the generalized diffusion\nparadigm for the propagation of a quantum particle over the graph. We term this\nnew convolution kernel the Quantum Diffusion Convolution (QDC) operator. In\naddition, we introduce a multiscale variant that combines messages from the QDC\noperator and the traditional combinatorial Laplacian. To understand our method,\nwe explore the spectral dependence of homophily and the importance of quantum\ndynamics in the construction of a bandpass filter. Through these studies, as\nwell as experiments on a range of datasets, we observe that QDC improves\npredictive performance on the widely used benchmark datasets when compared to\nsimilar methods.\n","authors":["Thomas Markovich"],"pdf_url":"https://arxiv.org/pdf/2307.11234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13807v2","updated":"2023-07-20T20:57:08Z","published":"2023-01-31T17:50:52Z","title":"Identifying the Hazard Boundary of ML-enabled Autonomous Systems Using\n Cooperative Co-Evolutionary Search","summary":" In Machine Learning (ML)-enabled autonomous systems (MLASs), it is essential\nto identify the hazard boundary of ML Components (MLCs) in the MLAS under\nanalysis. Given that such boundary captures the conditions in terms of MLC\nbehavior and system context that can lead to hazards, it can then be used to,\nfor example, build a safety monitor that can take any predefined fallback\nmechanisms at runtime when reaching the hazard boundary. However, determining\nsuch hazard boundary for an ML component is challenging. This is due to the\nproblem space combining system contexts (i.e., scenarios) and MLC behaviors\n(i.e., inputs and outputs) being far too large for exhaustive exploration and\neven to handle using conventional metaheuristics, such as genetic algorithms.\nAdditionally, the high computational cost of simulations required to determine\nany MLAS safety violations makes the problem even more challenging.\nFurthermore, it is unrealistic to consider a region in the problem space\ndeterministically safe or unsafe due to the uncontrollable parameters in\nsimulations and the non-linear behaviors of ML models (e.g., deep neural\nnetworks) in the MLAS under analysis. 
To address the challenges, we propose\nMLCSHE (ML Component Safety Hazard Envelope), a novel method based on a\nCooperative Co-Evolutionary Algorithm (CCEA), which aims to tackle a\nhigh-dimensional problem by decomposing it into two lower-dimensional search\nsubproblems. Moreover, we take a probabilistic view of safe and unsafe regions\nand define a novel fitness function to measure the distance from the\nprobabilistic hazard boundary and thus drive the search effectively. We\nevaluate the effectiveness and efficiency of MLCSHE on a complex Autonomous\nVehicle (AV) case study. Our evaluation results show that MLCSHE is\nsignificantly more effective and efficient compared to a standard genetic\nalgorithm and random search.\n","authors":["Sepehr Sharifi","Donghwan Shin","Lionel C. Briand","Nathan Aschbacher"],"pdf_url":"https://arxiv.org/pdf/2301.13807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11228v1","updated":"2023-07-20T20:46:39Z","published":"2023-07-20T20:46:39Z","title":"From Adaptive Query Release to Machine Unlearning","summary":" We formalize the problem of machine unlearning as design of efficient\nunlearning algorithms corresponding to learning algorithms which perform a\nselection of adaptive queries from structured query classes. We give efficient\nunlearning algorithms for linear and prefix-sum query classes. As applications,\nwe show that unlearning in many problems, in particular, stochastic convex\noptimization (SCO), can be reduced to the above, yielding improved guarantees\nfor the problem. In particular, for smooth Lipschitz losses and any $\\rho>0$,\nour results yield an unlearning algorithm with excess population risk of\n$\\tilde O\\big(\\frac{1}{\\sqrt{n}}+\\frac{\\sqrt{d}}{n\\rho}\\big)$ with unlearning\nquery (gradient) complexity $\\tilde O(\\rho \\cdot \\text{Retraining\nComplexity})$, where $d$ is the model dimensionality and $n$ is the initial\nnumber of samples. For non-smooth Lipschitz losses, we give an unlearning\nalgorithm with excess population risk $\\tilde\nO\\big(\\frac{1}{\\sqrt{n}}+\\big(\\frac{\\sqrt{d}}{n\\rho}\\big)^{1/2}\\big)$ with the\nsame unlearning query (gradient) complexity. Furthermore, in the special case\nof Generalized Linear Models (GLMs), such as those in linear and logistic\nregression, we get dimension-independent rates of $\\tilde\nO\\big(\\frac{1}{\\sqrt{n}} +\\frac{1}{(n\\rho)^{2/3}}\\big)$ and $\\tilde\nO\\big(\\frac{1}{\\sqrt{n}} +\\frac{1}{(n\\rho)^{1/3}}\\big)$ for smooth Lipschitz\nand non-smooth Lipschitz losses respectively. Finally, we give generalizations\nof the above from one unlearning request to \\textit{dynamic} streams consisting\nof insertions and deletions.\n","authors":["Enayat Ullah","Raman Arora"],"pdf_url":"https://arxiv.org/pdf/2307.11228v1.pdf","comment":"Accepted to ICML 2023"},{"id":"http://arxiv.org/abs/2307.11224v1","updated":"2023-07-20T20:37:24Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. While\nthese models are not exclusively designed for text generation, they excel in\napplications such as dense retrieval and semantic textual similarity. This\npaper details the development of Jina Embeddings, starting with the creation of\na high-quality pairwise and triplet dataset. 
It underlines the crucial role of\ndata cleaning in dataset preparation, gives in-depth insights into the model\ntraining process, and concludes with a comprehensive performance evaluation\nusing the Massive Textual Embedding Benchmark (MTEB).\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v1.pdf","comment":"9 pages, 2 page appendix, EMNLP 2023 Industrial Track"},{"id":"http://arxiv.org/abs/2307.11214v1","updated":"2023-07-20T19:56:30Z","published":"2023-07-20T19:56:30Z","title":"FairMobi-Net: A Fairness-aware Deep Learning Model for Urban Mobility\n Flow Generation","summary":" Generating realistic human flows across regions is essential for our\nunderstanding of urban structures and population activity patterns, enabling\nimportant applications in the fields of urban planning and management. However,\na notable shortcoming of most existing mobility generation methodologies is\nneglect of prediction fairness, which can result in underestimation of mobility\nflows across regions with vulnerable population groups, potentially resulting\nin inequitable resource distribution and infrastructure development. To\novercome this limitation, our study presents a novel, fairness-aware deep\nlearning model, FairMobi-Net, for inter-region human flow prediction. The\nFairMobi-Net model uniquely incorporates fairness loss into the loss function\nand employs a hybrid approach, merging binary classification and numerical\nregression techniques for human flow prediction. We validate the FairMobi-Net\nmodel using comprehensive human mobility datasets from four U.S. cities,\npredicting human flow at the census-tract level. Our findings reveal that the\nFairMobi-Net model outperforms state-of-the-art models (such as the DeepGravity\nmodel) in producing more accurate and equitable human flow predictions across a\nvariety of region pairs, regardless of regional income differences. The model\nmaintains a high degree of accuracy consistently across diverse regions,\naddressing the previous fairness concern. Further analysis of feature\nimportance elucidates the impact of physical distances and road network\nstructures on human flows across regions. With fairness as its touchstone, the\nmodel and results provide researchers and practitioners across the fields of\nurban sciences, transportation engineering, and computing with an effective\ntool for accurate generation of human mobility flows across regions.\n","authors":["Zhewei Liu","Lipai Huang","Chao Fan","Ali Mostafavi"],"pdf_url":"https://arxiv.org/pdf/2307.11214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11211v1","updated":"2023-07-20T19:53:09Z","published":"2023-07-20T19:53:09Z","title":"The Effect of Epidemiological Cohort Creation on the Machine Learning\n Prediction of Homelessness and Police Interaction Outcomes Using\n Administrative Health Care Data","summary":" Background: Mental illness can lead to adverse outcomes such as homelessness\nand police interaction and understanding of the events leading up to these\nadverse outcomes is important. Predictive models may help identify individuals\nat risk of such adverse outcomes. Using a fixed observation window cohort with\nlogistic regression (LR) or machine learning (ML) models can result in lower\nperformance when compared with adaptive and parcellated windows. 
Method: An\nadministrative healthcare dataset was used, comprising of 240,219 individuals\nin Calgary, Alberta, Canada who were diagnosed with addiction or mental health\n(AMH) between April 1, 2013, and March 31, 2018. The cohort was followed for 2\nyears to identify factors associated with homelessness and police interactions.\nTo understand the benefit of flexible windows to predictive models, an\nalternative cohort was created. Then LR and ML models, including random forests\n(RF), and extreme gradient boosting (XGBoost) were compared in the two cohorts.\nResults: Among 237,602 individuals, 0.8% (1,800) experienced first\nhomelessness, while 0.32% (759) reported initial police interaction among\n237,141 individuals. Male sex (AORs: H=1.51, P=2.52), substance disorder (AORs:\nH=3.70, P=2.83), psychiatrist visits (AORs: H=1.44, P=1.49), and drug abuse\n(AORs: H=2.67, P=1.83) were associated with initial homelessness (H) and police\ninteraction (P). XGBoost showed superior performance using the flexible method\n(sensitivity =91%, AUC =90% for initial homelessness, and sensitivity =90%,\nAUC=89% for initial police interaction)\n Conclusion: This study identified key features associated with initial\nhomelessness and police interaction and demonstrated that flexible windows can\nimprove predictive modeling.\n","authors":["Faezehsadat Shahidi","M. Ethan MacDonald","Dallas Seitz","Geoffrey Messier"],"pdf_url":"https://arxiv.org/pdf/2307.11211v1.pdf","comment":"to be published in Frontiers in Digital Health, Health Informatics"},{"id":"http://arxiv.org/abs/2307.11209v1","updated":"2023-07-20T19:52:14Z","published":"2023-07-20T19:52:14Z","title":"Clinical Trial Active Learning","summary":" This paper presents a novel approach to active learning that takes into\naccount the non-independent and identically distributed (non-i.i.d.) structure\nof a clinical trial setting. There exists two types of clinical trials:\nretrospective and prospective. Retrospective clinical trials analyze data after\ntreatment has been performed; prospective clinical trials collect data as\ntreatment is ongoing. Typically, active learning approaches assume the dataset\nis i.i.d. when selecting training samples; however, in the case of clinical\ntrials, treatment results in a dependency between the data collected at the\ncurrent and past visits. Thus, we propose prospective active learning to\novercome the limitations present in traditional active learning methods and\napply it to disease detection in optical coherence tomography (OCT) images,\nwhere we condition on the time an image was collected to enforce the i.i.d.\nassumption. We compare our proposed method to the traditional active learning\nparadigm, which we refer to as retrospective in nature. We demonstrate that\nprospective active learning outperforms retrospective active learning in two\ndifferent types of test settings.\n","authors":["Zoe Fowler","Kiran Kokilepersaud","Mohit Prabhushankar","Ghassan AlRegib"],"pdf_url":"https://arxiv.org/pdf/2307.11209v1.pdf","comment":"Accepted at 14th ACM International Conference on Bioinformatics,\n Computational Biology and Health Informatics (ACM-BCB)"},{"id":"http://arxiv.org/abs/2307.06324v4","updated":"2023-07-20T19:51:06Z","published":"2023-07-12T17:41:07Z","title":"Provably Faster Gradient Descent via Long Steps","summary":" This work establishes provably faster convergence rates for gradient descent\nin smooth convex optimization via a computer-assisted analysis technique. 
Our\ntheory allows nonconstant stepsize policies with frequent long steps\npotentially violating descent by analyzing the overall effect of many\niterations at once rather than the typical one-iteration inductions used in\nmost first-order method analyses. We show that long steps, which may increase\nthe objective value in the short term, lead to provably faster convergence in\nthe long term. A conjecture towards proving a faster $O(1/T\\log T)$ rate for\ngradient descent is also motivated along with simple numerical validation.\n","authors":["Benjamin Grimmer"],"pdf_url":"https://arxiv.org/pdf/2307.06324v4.pdf","comment":"Apologies for the several updates done shortly after first posting\n this work: In these, I have added more references to excellent relevant works\n I missed in my initial literature review, esp the Master's thesis of Jason\n Altschuler"},{"id":"http://arxiv.org/abs/2210.03297v2","updated":"2023-07-20T19:28:22Z","published":"2022-10-07T03:10:34Z","title":"Preprocessors Matter! Realistic Decision-Based Attacks on Machine\n Learning Systems","summary":" Decision-based attacks construct adversarial examples against a machine\nlearning (ML) model by making only hard-label queries. These attacks have\nmainly been applied directly to standalone neural networks. However, in\npractice, ML models are just one component of a larger learning system. We find\nthat by adding a single preprocessor in front of a classifier, state-of-the-art\nquery-based attacks are up to 7$\\times$ less effective at attacking a\nprediction pipeline than at attacking the model alone. We explain this\ndiscrepancy by the fact that most preprocessors introduce some notion of\ninvariance to the input space. Hence, attacks that are unaware of this\ninvariance inevitably waste a large number of queries to re-discover or\novercome it. We, therefore, develop techniques to (i) reverse-engineer the\npreprocessor and then (ii) use this extracted information to attack the\nend-to-end system. Our preprocessors extraction method requires only a few\nhundred queries, and our preprocessor-aware attacks recover the same efficacy\nas when attacking the model alone. The code can be found at\nhttps://github.com/google-research/preprocessor-aware-black-box-attack.\n","authors":["Chawin Sitawarin","Florian Tramèr","Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2210.03297v2.pdf","comment":"ICML 2023. Code can be found at\n https://github.com/google-research/preprocessor-aware-black-box-attack"},{"id":"http://arxiv.org/abs/2307.11197v1","updated":"2023-07-20T19:20:35Z","published":"2023-07-20T19:20:35Z","title":"Heuristic Hyperparameter Choice for Image Anomaly Detection","summary":" Anomaly detection (AD) in images is a fundamental computer vision problem by\ndeep learning neural network to identify images deviating significantly from\nnormality. The deep features extracted from pretrained models have been proved\nto be essential for AD based on multivariate Gaussian distribution analysis.\nHowever, since models are usually pretrained on a large dataset for\nclassification tasks such as ImageNet, they might produce lots of redundant\nfeatures for AD, which increases computational cost and degrades the\nperformance. We aim to do the dimension reduction of Negated Principal\nComponent Analysis (NPCA) for these features. So we proposed some heuristic to\nchoose hyperparameter of NPCA algorithm for getting as fewer components of\nfeatures as possible while ensuring a good performance.\n","authors":["Zeyu Jiang","João P. C. 
Bertoldo","Etienne Decencière"],"pdf_url":"https://arxiv.org/pdf/2307.11197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03933v2","updated":"2023-07-20T19:12:45Z","published":"2023-06-06T18:01:03Z","title":"High-dimensional and Permutation Invariant Anomaly Detection","summary":" Methods for anomaly detection of new physics processes are often limited to\nlow-dimensional spaces due to the difficulty of learning high-dimensional\nprobability densities. Particularly at the constituent level, incorporating\ndesirable properties such as permutation invariance and variable-length inputs\nbecomes difficult within popular density estimation methods. In this work, we\nintroduce a permutation-invariant density estimator for particle physics data\nbased on diffusion models, specifically designed to handle variable-length\ninputs. We demonstrate the efficacy of our methodology by utilizing the learned\ndensity as a permutation-invariant anomaly detection score, effectively\nidentifying jets with low likelihood under the background-only hypothesis. To\nvalidate our density estimation method, we investigate the ratio of learned\ndensities and compare to those obtained by a supervised classification\nalgorithm.\n","authors":["Vinicius Mikuni","Benjamin Nachman"],"pdf_url":"https://arxiv.org/pdf/2306.03933v2.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2212.12606v2","updated":"2023-07-20T18:58:11Z","published":"2022-12-23T22:44:25Z","title":"A Convergence Rate for Manifold Neural Networks","summary":" High-dimensional data arises in numerous applications, and the rapidly\ndeveloping field of geometric deep learning seeks to develop neural network\narchitectures to analyze such data in non-Euclidean domains, such as graphs and\nmanifolds. Recent work by Z. Wang, L. Ruiz, and A. Ribeiro has introduced a\nmethod for constructing manifold neural networks using the spectral\ndecomposition of the Laplace Beltrami operator. Moreover, in this work, the\nauthors provide a numerical scheme for implementing such neural networks when\nthe manifold is unknown and one only has access to finitely many sample points.\nThe authors show that this scheme, which relies upon building a data-driven\ngraph, converges to the continuum limit as the number of sample points tends to\ninfinity. Here, we build upon this result by establishing a rate of convergence\nthat depends on the intrinsic dimension of the manifold but is independent of\nthe ambient dimension. We also discuss how the rate of convergence depends on\nthe depth of the network and the number of filters used in each layer.\n","authors":["Joyce Chew","Deanna Needell","Michael Perlmutter"],"pdf_url":"https://arxiv.org/pdf/2212.12606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11589v2","updated":"2023-07-20T18:48:37Z","published":"2022-10-20T21:01:14Z","title":"Monotonic Risk Relationships under Distribution Shifts for Regularized\n Risk Minimization","summary":" Machine learning systems are often applied to data that is drawn from a\ndifferent distribution than the training distribution. Recent work has shown\nthat for a variety of classification and signal reconstruction problems, the\nout-of-distribution performance is strongly linearly correlated with the\nin-distribution performance. If this relationship or more generally a monotonic\none holds, it has important consequences. For example, it allows to optimize\nperformance on one distribution as a proxy for performance on the other. 
In\nthis paper, we study conditions under which a monotonic relationship between\nthe performances of a model on two distributions is expected. We prove an exact\nasymptotic linear relation for squared error and a monotonic relation for\nmisclassification error for ridge-regularized general linear models under\ncovariate shift, as well as an approximate linear relation for linear inverse\nproblems.\n","authors":["Daniel LeJeune","Jiayu Liu","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2210.11589v2.pdf","comment":"34 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.09782v2","updated":"2023-07-20T18:47:20Z","published":"2023-07-19T06:58:03Z","title":"ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization\n Using Floating-Point Formats","summary":" In the complex domain of large language models (LLMs), striking a balance\nbetween computational efficiency and maintaining model quality is a formidable\nchallenge. Navigating the inherent limitations of uniform quantization,\nparticularly when dealing with outliers, and motivated by the launch of\nNVIDIA's H100 hardware, this study delves into the viability of floating-point\n(FP) quantization, particularly focusing on FP8 and FP4, as a potential\nsolution. Our comprehensive investigation reveals that for LLMs, FP8 activation\nconsistently outshines its integer (INT8) equivalent, with the performance edge\nbecoming more noticeable in models possessing parameters beyond one billion.\nFor weight quantization, our findings indicate that FP4 exhibits comparable, if\nnot superior, performance to INT4, simplifying deployment on FP-supported\nhardware like H100. To mitigate the overhead from precision alignment caused by\nthe disparity between weights and activations, we propose two scaling\nconstraints for weight quantization that negligibly impact the performance\ncompared to the standard W4A8 model. We additionally enhance our quantization\nmethods by integrating the Low Rank Compensation (LoRC) strategy, yielding\nimprovements especially in smaller models. The results of our investigation\nemphasize the immense potential of FP quantization for LLMs, paving the way for\nhigh-efficiency deployment in resource-limited settings.\n","authors":["Xiaoxia Wu","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2307.09782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11166v1","updated":"2023-07-20T18:01:48Z","published":"2023-07-20T18:01:48Z","title":"Exploring reinforcement learning techniques for discrete and continuous\n control tasks in the MuJoCo environment","summary":" We leverage the fast physics simulator, MuJoCo to run tasks in a continuous\ncontrol environment and reveal details like the observation space, action\nspace, rewards, etc. for each task. We benchmark value-based methods for\ncontinuous control by comparing Q-learning and SARSA through a discretization\napproach, and using them as baselines, progressively moving into one of the\nstate-of-the-art deep policy gradient method DDPG. Over a large number of\nepisodes, Qlearning outscored SARSA, but DDPG outperformed both in a small\nnumber of episodes. Lastly, we also fine-tuned the model hyper-parameters\nexpecting to squeeze more performance but using lesser time and resources. We\nanticipated that the new design for DDPG would vastly improve performance, yet\nafter only a few episodes, we were able to achieve decent average rewards. 
We\nexpect to improve the performance provided adequate time and computational\nresources.\n","authors":["Vaddadi Sai Rahul","Debajyoti Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2307.11166v1.pdf","comment":"Released @ Dec 2021. For associated project files, see\n https://github.com/chakrabortyde/mujoco-control-tasks"}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.11025v1","updated":"2023-07-20T16:53:41Z","published":"2023-07-20T16:53:41Z","title":"Investigating VTubing as a Reconstruction of Streamer Self-Presentation:\n Identity, Performance, and Gender","summary":" VTubers, or Virtual YouTubers, are live streamers who create streaming\ncontent using animated 2D or 3D virtual avatars. In recent years, there has\nbeen a significant increase in the number of VTuber creators and viewers across\nthe globe. This practise has drawn research attention into topics such as\nviewers' engagement behaviors and perceptions, however, as animated avatars\noffer more identity and performance flexibility than traditional live streaming\nwhere one uses their own body, little research has focused on how this\nflexibility influences how creators present themselves. This research thus\nseeks to fill this gap by presenting results from a qualitative study of 16\nChinese-speaking VTubers' streaming practices. The data revealed that the\nvirtual avatars that were used while live streaming afforded creators\nopportunities to present themselves using inflated presentations and resulted\nin inclusive interactions with viewers. The results also unveiled the inflated,\nand often sexualized, gender expressions of VTubers while they were situated in\nmisogynistic environments. The socio-technical facets of VTubing were found to\npotentially reduce sexual harassment and sexism, whilst also raising\nself-objectification concerns.\n","authors":["Qian Wan","Zhicong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.11025v1.pdf","comment":"Under review at ACM CSCW after a Major Revision"},{"id":"http://arxiv.org/abs/2210.05335v3","updated":"2023-07-20T16:24:14Z","published":"2022-10-11T10:54:54Z","title":"MAP: Multimodal Uncertainty-Aware Vision-Language Pre-training Model","summary":" Multimodal semantic understanding often has to deal with uncertainty, which\nmeans the obtained messages tend to refer to multiple targets. Such uncertainty\nis problematic for our interpretation, including inter- and intra-modal\nuncertainty. Little effort has studied the modeling of this uncertainty,\nparticularly in pre-training on unlabeled datasets and fine-tuning in\ntask-specific downstream datasets. In this paper, we project the\nrepresentations of all modalities as probabilistic distributions via a\nProbability Distribution Encoder (PDE) by utilizing sequence-level\ninteractions. Compared to the existing deterministic methods, such uncertainty\nmodeling can convey richer multimodal semantic information and more complex\nrelationships. Furthermore, we integrate uncertainty modeling with popular\npre-training frameworks and propose suitable pre-training tasks:\nDistribution-based Vision-Language Contrastive learning (D-VLC),\nDistribution-based Masked Language Modeling (D-MLM), and Distribution-based\nImage-Text Matching (D-ITM). 
The fine-tuned models are applied to challenging\ndownstream tasks, including image-text retrieval, visual question answering,\nvisual reasoning, and visual entailment, and achieve state-of-the-art results.\n","authors":["Yatai Ji","Junjie Wang","Yuan Gong","Lin Zhang","Yanru Zhu","Hongfa Wang","Jiaxing Zhang","Tetsuya Sakai","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05335v3.pdf","comment":"CVPR 2023 Main Track Long Paper"},{"id":"http://arxiv.org/abs/2307.10802v1","updated":"2023-07-20T12:10:29Z","published":"2023-07-20T12:10:29Z","title":"Meta-Transformer: A Unified Framework for Multimodal Learning","summary":" Multimodal learning aims to build models that can process and relate\ninformation from multiple modalities. Despite years of development in this\nfield, it still remains challenging to design a unified network for processing\nvarious modalities ($\\textit{e.g.}$ natural language, 2D images, 3D point\nclouds, audio, video, time series, tabular data) due to the inherent gaps among\nthem. In this work, we propose a framework, named Meta-Transformer, that\nleverages a $\\textbf{frozen}$ encoder to perform multimodal perception without\nany paired multimodal training data. In Meta-Transformer, the raw input data\nfrom various modalities are mapped into a shared token space, allowing a\nsubsequent encoder with frozen parameters to extract high-level semantic\nfeatures of the input data. Composed of three main components: a unified data\ntokenizer, a modality-shared encoder, and task-specific heads for downstream\ntasks, Meta-Transformer is the first framework to perform unified learning\nacross 12 modalities with unpaired data. Experiments on different benchmarks\nreveal that Meta-Transformer can handle a wide range of tasks including\nfundamental perception (text, image, point cloud, audio, video), practical\napplication (X-Ray, infrared, hyperspectral, and IMU), and data mining (graph,\ntabular, and time-series). Meta-Transformer indicates a promising future for\ndeveloping unified multimodal intelligence with transformers. Code will be\navailable at https://github.com/invictus717/MetaTransformer\n","authors":["Yiyuan Zhang","Kaixiong Gong","Kaipeng Zhang","Hongsheng Li","Yu Qiao","Wanli Ouyang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.10802v1.pdf","comment":"Project website: https://kxgong.github.io/meta_transformer/"},{"id":"http://arxiv.org/abs/2303.12112v3","updated":"2023-07-20T08:16:09Z","published":"2023-03-21T18:03:14Z","title":"Positive-Augmented Contrastive Learning for Image and Video Captioning\n Evaluation","summary":" The CLIP model has been recently proven to be very effective for a variety of\ncross-modal tasks, including the evaluation of captions generated from\nvision-and-language architectures. In this paper, we propose a new recipe for a\ncontrastive-based evaluation metric for image captioning, namely\nPositive-Augmented Contrastive learning Score (PAC-S), that in a novel way\nunifies the learning of a contrastive visual-semantic space with the addition\nof generated images and text on curated data. Experiments spanning several\ndatasets demonstrate that our new metric achieves the highest correlation with\nhuman judgments on both images and videos, outperforming existing\nreference-based metrics like CIDEr and SPICE and reference-free metrics like\nCLIP-Score. 
Finally, we test the system-level correlation of the proposed\nmetric when considering popular image captioning approaches, and assess the\nimpact of employing different cross-modal features. Our source code and trained\nmodels are publicly available at: https://github.com/aimagelab/pacscore.\n","authors":["Sara Sarto","Manuele Barraco","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2303.12112v3.pdf","comment":"CVPR 2023 (highlight paper)"},{"id":"http://arxiv.org/abs/2307.10642v1","updated":"2023-07-20T07:12:56Z","published":"2023-07-20T07:12:56Z","title":"RetouchingFFHQ: A Large-scale Dataset for Fine-grained Face Retouching\n Detection","summary":" The widespread use of face retouching filters on short-video platforms has\nraised concerns about the authenticity of digital appearances and the impact of\ndeceptive advertising. To address these issues, there is a pressing need to\ndevelop advanced face retouching techniques. However, the lack of large-scale\nand fine-grained face retouching datasets has been a major obstacle to progress\nin this field. In this paper, we introduce RetouchingFFHQ, a large-scale and\nfine-grained face retouching dataset that contains over half a million\nconditionally-retouched images. RetouchingFFHQ stands out from previous\ndatasets due to its large scale, high quality, fine-grainedness, and\ncustomization. By including four typical types of face retouching operations\nand different retouching levels, we extend the binary face retouching detection\ninto a fine-grained, multi-retouching type, and multi-retouching level\nestimation problem. Additionally, we propose a Multi-granularity Attention\nModule (MAM) as a plugin for CNN backbones for enhanced cross-scale\nrepresentation learning. Extensive experiments using different baselines as\nwell as our proposed method on RetouchingFFHQ show decent performance on face\nretouching detection. With the proposed new dataset, we believe there is great\npotential for future work to tackle the challenging problem of real-world\nfine-grained face retouching detection.\n","authors":["Qichao Ying","Jiaxin Liu","Sheng Li","Haisheng Xu","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10642v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2306.03718v4","updated":"2023-07-20T03:06:50Z","published":"2023-06-06T14:28:57Z","title":"Emotion-Conditioned Melody Harmonization with Hierarchical Variational\n Autoencoder","summary":" Existing melody harmonization models have made great progress in improving\nthe quality of generated harmonies, but most of them ignored the emotions\nbeneath the music. Meanwhile, the variability of harmonies generated by\nprevious methods is insufficient. To solve these problems, we propose a novel\nLSTM-based Hierarchical Variational Auto-Encoder (LHVAE) to investigate the\ninfluence of emotional conditions on melody harmonization, while improving the\nquality of generated harmonies and capturing the abundant variability of chord\nprogressions. Specifically, LHVAE incorporates latent variables and emotional\nconditions at different levels (piece- and bar-level) to model the global and\nlocal music properties. Additionally, we introduce an attention-based melody\ncontext vector at each step to better learn the correspondence between melodies\nand harmonies. Objective experimental results show that our proposed model\noutperforms other LSTM-based models. 
Through subjective evaluation, we conclude\nthat only altering the types of chords hardly changes the overall emotion of\nthe music. The qualitative analysis demonstrates the ability of our model to\ngenerate variable harmonies.\n","authors":["Shulei Ji","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2306.03718v4.pdf","comment":"Accepted by IEEE SMC 2023"}]},"2023-07-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.11729v1","updated":"2023-07-21T17:40:47Z","published":"2023-07-21T17:40:47Z","title":"OUTFOX: LLM-generated Essay Detection through In-context Learning with\n Adversarially Generated Examples","summary":" Large Language Models (LLMs) have achieved human-level fluency in text\ngeneration, making it difficult to distinguish between human-written and\nLLM-generated texts. This poses a growing risk of misuse of LLMs and demands\nthe development of detectors to identify LLM-generated texts. However, existing\ndetectors degrade detection accuracy by simply paraphrasing LLM-generated\ntexts. Furthermore, the effectiveness of these detectors in real-life\nsituations, such as when students use LLMs for writing homework assignments\n(e.g., essays) and quickly learn how to evade these detectors, has not been\nexplored. In this paper, we propose OUTFOX, a novel framework that improves the\nrobustness of LLM-generated-text detectors by allowing both the detector and\nthe attacker to consider each other's output and apply this to the domain of\nstudent essays. In our framework, the attacker uses the detector's prediction\nlabels as examples for in-context learning and adversarially generates essays\nthat are harder to detect. While the detector uses the adversarially generated\nessays as examples for in-context learning to learn to detect essays from a\nstrong attacker. Our experiments show that our proposed detector learned\nin-context from the attacker improves the detection performance on the attacked\ndataset by up to +41.3 point F1-score. While our proposed attacker can\ndrastically degrade the performance of the detector by up to -57.0 point\nF1-score compared to the paraphrasing method.\n","authors":["Ryuto Koike","Masahiro Kaneko","Naoaki Okazaki"],"pdf_url":"https://arxiv.org/pdf/2307.11729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10490v2","updated":"2023-07-21T16:51:15Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. 
We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06576v3","updated":"2023-07-21T16:06:32Z","published":"2023-07-13T06:25:22Z","title":"Going Beyond Local: Global Graph-Enhanced Personalized News\n Recommendations","summary":" Precisely recommending candidate news articles to users has always been a\ncore challenge for personalized news recommendation systems. Most recent works\nprimarily focus on using advanced natural language processing techniques to\nextract semantic information from rich textual data, employing content-based\nmethods derived from local historical news. However, this approach lacks a\nglobal perspective, failing to account for users' hidden motivations and\nbehaviors beyond semantic information. To address this challenge, we propose a\nnovel model called GLORY (Global-LOcal news Recommendation sYstem), which\ncombines global representations learned from other users with local\nrepresentations to enhance personalized recommendation systems. We accomplish\nthis by constructing a Global-aware Historical News Encoder, which includes a\nglobal news graph and employs gated graph neural networks to enrich news\nrepresentations, thereby fusing historical news representations by a historical\nnews aggregator. Similarly, we extend this approach to a Global Candidate News\nEncoder, utilizing a global entity graph and a candidate news aggregator to\nenhance candidate news representation. Evaluation results on two public news\ndatasets demonstrate that our method outperforms existing approaches.\nFurthermore, our model offers more diverse recommendations.\n","authors":["Boming Yang","Dairui Liu","Toyotaro Suzumura","Ruihai Dong","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2307.06576v3.pdf","comment":"10 pages, Recsys 2023"},{"id":"http://arxiv.org/abs/2307.11661v1","updated":"2023-07-21T15:49:59Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. 
We will release the code, prompts, and auxiliary text\ndataset upon acceptance.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v1.pdf","comment":"10 pages, Pre-print"},{"id":"http://arxiv.org/abs/2307.11636v1","updated":"2023-07-21T14:58:44Z","published":"2023-07-21T14:58:44Z","title":"OxfordTVG-HIC: Can Machine Make Humorous Captions from Images?","summary":" This paper presents OxfordTVG-HIC (Humorous Image Captions), a large-scale\ndataset for humour generation and understanding. Humour is an abstract,\nsubjective, and context-dependent cognitive construct involving several\ncognitive factors, making it a challenging task to generate and interpret.\nHence, humour generation and understanding can serve as a new task for\nevaluating the ability of deep-learning methods to process abstract and\nsubjective information. Due to the scarcity of data, humour-related generation\ntasks such as captioning remain under-explored. To address this gap,\nOxfordTVG-HIC offers approximately 2.9M image-text pairs with humour scores to\ntrain a generalizable humour captioning model. Contrary to existing captioning\ndatasets, OxfordTVG-HIC features a wide range of emotional and semantic\ndiversity resulting in out-of-context examples that are particularly conducive\nto generating humour. Moreover, OxfordTVG-HIC is curated devoid of offensive\ncontent. We also show how OxfordTVG-HIC can be leveraged for evaluating the\nhumour of a generated text. Through explainability analysis of the trained\nmodels, we identify the visual and linguistic cues influential for evoking\nhumour prediction (and generation). We observe qualitatively that these cues\nare aligned with the benign violation theory of humour in cognitive psychology.\n","authors":["Runjia Li","Shuyang Sun","Mohamed Elhoseiny","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2307.11636v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2212.09648v4","updated":"2023-07-21T14:44:45Z","published":"2022-12-19T17:28:22Z","title":"NusaCrowd: Open Source Initiative for Indonesian NLP Resources","summary":" We present NusaCrowd, a collaborative initiative to collect and unify\nexisting resources for Indonesian languages, including opening access to\npreviously non-public resources. Through this initiative, we have brought\ntogether 137 datasets and 118 standardized data loaders. The quality of the\ndatasets has been assessed manually and automatically, and their value is\ndemonstrated through multiple experiments. NusaCrowd's data collection enables\nthe creation of the first zero-shot benchmarks for natural language\nunderstanding and generation in Indonesian and the local languages of\nIndonesia. Furthermore, NusaCrowd brings the creation of the first multilingual\nautomatic speech recognition benchmark in Indonesian and the local languages of\nIndonesia. Our work strives to advance natural language processing (NLP)\nresearch for languages that are under-represented despite being widely spoken.\n","authors":["Samuel Cahyawijaya","Holy Lovenia","Alham Fikri Aji","Genta Indra Winata","Bryan Wilie","Rahmad Mahendra","Christian Wibisono","Ade Romadhony","Karissa Vincentio","Fajri Koto","Jennifer Santoso","David Moeljadi","Cahya Wirawan","Frederikus Hudi","Ivan Halim Parmonangan","Ika Alfina","Muhammad Satrio Wicaksono","Ilham Firdausi Putra","Samsul Rahmadani","Yulianti Oenang","Ali Akbar Septiandri","James Jaya","Kaustubh D. 
Dhole","Arie Ardiyanti Suryani","Rifki Afina Putri","Dan Su","Keith Stevens","Made Nindyatama Nityasya","Muhammad Farid Adilazuarda","Ryan Ignatius","Ryandito Diandaru","Tiezheng Yu","Vito Ghifari","Wenliang Dai","Yan Xu","Dyah Damapuspita","Cuk Tho","Ichwanul Muslim Karo Karo","Tirana Noor Fatyanosa","Ziwei Ji","Pascale Fung","Graham Neubig","Timothy Baldwin","Sebastian Ruder","Herry Sujaini","Sakriani Sakti","Ayu Purwarianti"],"pdf_url":"https://arxiv.org/pdf/2212.09648v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11610v1","updated":"2023-07-21T14:25:39Z","published":"2023-07-21T14:25:39Z","title":"CausE: Towards Causal Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) focuses on representing the entities and\nrelations of a knowledge graph (KG) into the continuous vector spaces, which\ncan be employed to predict the missing triples to achieve knowledge graph\ncompletion (KGC). However, KGE models often only briefly learn structural\ncorrelations of triple data and embeddings would be misled by the trivial\npatterns and noisy links in real-world KGs. To address this issue, we build the\nnew paradigm of KGE in the context of causality and embedding disentanglement.\nWe further propose a Causality-enhanced knowledge graph Embedding (CausE)\nframework. CausE employs causal intervention to estimate the causal effect of\nthe confounder embeddings and design new training objectives to make stable\npredictions. Experimental results demonstrate that CausE could outperform the\nbaseline models and achieve state-of-the-art KGC performance. We release our\ncode in https://github.com/zjukg/CausE.\n","authors":["Yichi Zhang","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.00841v3","updated":"2023-07-21T14:03:40Z","published":"2021-07-02T05:29:39Z","title":"ClueReader: Heterogeneous Graph Attention Network for Multi-hop Machine\n Reading Comprehension","summary":" Multi-hop machine reading comprehension is a challenging task in natural\nlanguage processing as it requires more reasoning ability across multiple\ndocuments. Spectral models based on graph convolutional networks have shown\ngood inferring abilities and lead to competitive results. However, the analysis\nand reasoning of some are inconsistent with those of humans. Inspired by the\nconcept of grandmother cells in cognitive neuroscience, we propose a\nheterogeneous graph attention network model named ClueReader to imitate the\ngrandmother cell concept. The model is designed to assemble the semantic\nfeatures in multi-level representations and automatically concentrate or\nalleviate information for reasoning through the attention mechanism. The name\nClueReader is a metaphor for the pattern of the model: it regards the subjects\nof queries as the starting points of clues, takes the reasoning entities as\nbridge points, considers the latent candidate entities as grandmother cells,\nand the clues end up in candidate entities. The proposed model enables the\nvisualization of the reasoning graph, making it possible to analyze the\nimportance of edges connecting entities and the selectivity in the mention and\ncandidate nodes, which is easier to comprehend empirically. 
Evaluations on the\nopen-domain multi-hop reading dataset WikiHop and drug-drug interaction dataset\nMedHop proved the validity of ClueReader and showed the feasibility of its\napplication of the model in the molecular biology domain.\n","authors":["Peng Gao","Feng Gao","Peng Wang","Jian-Cheng Ni","Fei Wang","Hamido Fujita"],"pdf_url":"https://arxiv.org/pdf/2107.00841v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11584v1","updated":"2023-07-21T13:48:11Z","published":"2023-07-21T13:48:11Z","title":"A Change of Heart: Improving Speech Emotion Recognition through\n Speech-to-Text Modality Conversion","summary":" Speech Emotion Recognition (SER) is a challenging task. In this paper, we\nintroduce a modality conversion concept aimed at enhancing emotion recognition\nperformance on the MELD dataset. We assess our approach through two\nexperiments: first, a method named Modality-Conversion that employs automatic\nspeech recognition (ASR) systems, followed by a text classifier; second, we\nassume perfect ASR output and investigate the impact of modality conversion on\nSER, this method is called Modality-Conversion++. Our findings indicate that\nthe first method yields substantial results, while the second method\noutperforms state-of-the-art (SOTA) speech-based approaches in terms of SER\nweighted-F1 (WF1) score on the MELD dataset. This research highlights the\npotential of modality conversion for tasks that can be conducted in alternative\nmodalities.\n","authors":["Zeinab Sadat Taghavi","Ali Satvaty","Hossein Sameti"],"pdf_url":"https://arxiv.org/pdf/2307.11584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11558v1","updated":"2023-07-21T13:06:02Z","published":"2023-07-21T13:06:02Z","title":"Advancing Visual Grounding with Scene Knowledge: Benchmark and Method","summary":" Visual grounding (VG) aims to establish fine-grained alignment between vision\nand language. Ideally, it can be a testbed for vision-and-language models to\nevaluate their understanding of the images and texts and their reasoning\nabilities over their joint space. However, most existing VG datasets are\nconstructed using simple description texts, which do not require sufficient\nreasoning over the images and texts. This has been demonstrated in a recent\nstudy~\\cite{luo2022goes}, where a simple LSTM-based text encoder without\npretraining can achieve state-of-the-art performance on mainstream VG datasets.\nTherefore, in this paper, we propose a novel benchmark of \\underline{S}cene\n\\underline{K}nowledge-guided \\underline{V}isual \\underline{G}rounding (SK-VG),\nwhere the image content and referring expressions are not sufficient to ground\nthe target objects, forcing the models to have a reasoning ability on the\nlong-form scene knowledge. To perform this task, we propose two approaches to\naccept the triple-type input, where the former embeds knowledge into the image\nfeatures before the image-query interaction; the latter leverages linguistic\nstructure to assist in computing the image-text matching. We conduct extensive\nexperiments to analyze the above methods and show that the proposed approaches\nachieve promising results but still leave room for improvement, including\nperformance and interpretability. The dataset and code are available at\n\\url{https://github.com/zhjohnchan/SK-VG}.\n","authors":["Zhihong Chen","Ruifei Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11558v1.pdf","comment":"Computer Vision and Natural Language Processing. 
21 pages, 14\n figures. CVPR-2023"},{"id":"http://arxiv.org/abs/2307.11545v1","updated":"2023-07-21T12:46:15Z","published":"2023-07-21T12:46:15Z","title":"Bridging Vision and Language Encoders: Parameter-Efficient Tuning for\n Referring Image Segmentation","summary":" Parameter Efficient Tuning (PET) has gained attention for reducing the number\nof parameters while maintaining performance and providing better hardware\nresource savings, but few studies investigate dense prediction tasks and\ninteraction between modalities. In this paper, we do an investigation of\nefficient tuning problems on referring image segmentation. We propose a novel\nadapter called Bridger to facilitate cross-modal information exchange and\ninject task-specific information into the pre-trained model. We also design a\nlightweight decoder for image segmentation. Our approach achieves comparable or\nsuperior performance with only 1.61\\% to 3.38\\% backbone parameter updates,\nevaluated on challenging benchmarks. The code is available at\n\\url{https://github.com/kkakkkka/ETRIS}.\n","authors":["Zunnan Xu","Zhihong Chen","Yong Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11545v1.pdf","comment":"Computer Vision and Natural Language Processing. 14 pages, 8 figures.\n ICCV-2023"},{"id":"http://arxiv.org/abs/2307.11516v1","updated":"2023-07-21T11:54:53Z","published":"2023-07-21T11:54:53Z","title":"IndigoVX: Where Human Intelligence Meets AI for Optimal Decision Making","summary":" This paper defines a new approach for augmenting human intelligence with AI\nfor optimal goal solving. Our proposed AI, Indigo, is an acronym for Informed\nNumerical Decision-making through Iterative Goal-Oriented optimization. When\ncombined with a human collaborator, we term the joint system IndigoVX, for\nVirtual eXpert. The system is conceptually simple. We envisage this method\nbeing applied to games or business strategies, with the human providing\nstrategic context and the AI offering optimal, data-driven moves. Indigo\noperates through an iterative feedback loop, harnessing the human expert's\ncontextual knowledge and the AI's data-driven insights to craft and refine\nstrategies towards a well-defined goal. Using a quantified three-score schema,\nthis hybridization allows the combined team to evaluate strategies and refine\ntheir plan, while adapting to challenges and changes in real-time.\n","authors":["Kais Dukes"],"pdf_url":"https://arxiv.org/pdf/2307.11516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.04900v2","updated":"2023-07-21T11:04:29Z","published":"2021-05-11T09:41:25Z","title":"Forecasting consumer confidence through semantic network analysis of\n online news","summary":" This research studies the impact of online news on social and economic\nconsumer perceptions through semantic network analysis. Using over 1.8 million\nonline articles on Italian media covering four years, we calculate the semantic\nimportance of specific economic-related keywords to see if words appearing in\nthe articles could anticipate consumers' judgments about the economic situation\nand the Consumer Confidence Index. We use an innovative approach to analyze big\ntextual data, combining methods and tools of text mining and social network\nanalysis. Results show a strong predictive power for the judgments about the\ncurrent households and national situation. 
Our indicator offers a complementary\napproach to estimating consumer confidence, lessening the limitations of\ntraditional survey-based methods.\n","authors":["A. Fronzetti Colladon","F. Grippa","B. Guardabascio","G. Costante","F. Ravazzolo"],"pdf_url":"https://arxiv.org/pdf/2105.04900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12851v2","updated":"2023-07-21T10:22:53Z","published":"2023-05-22T09:20:58Z","title":"Enhancing Coherence of Extractive Summarization with Multitask Learning","summary":" This study proposes a multitask learning architecture for extractive\nsummarization with coherence boosting. The architecture contains an extractive\nsummarizer and coherent discriminator module. The coherent discriminator is\ntrained online on the sentence vectors of the augmented textual input, thus\nimproving its general ability of judging whether the input sentences are\ncoherent. Meanwhile, we maximize the coherent scores from the coherent\ndiscriminator by updating the parameters of the summarizer. To make the\nextractive sentences trainable in a differentiable manner, we introduce two\nstrategies, including pre-trained converting model (model-based) and converting\nmatrix (MAT-based) that merge sentence representations. Experiments show that\nour proposed method significantly improves the proportion of consecutive\nsentences in the extracted summaries based on their positions in the original\narticle (i.e., automatic sentence-level coherence metric), while the goodness\nin terms of other automatic metrics (i.e., Rouge scores and BertScores) are\npreserved. Human evaluation also evidences the improvement of coherence and\nconsistency of the extracted summaries given by our method.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2305.12851v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.11457v1","updated":"2023-07-21T09:39:50Z","published":"2023-07-21T09:39:50Z","title":"Incorporating Human Translator Style into English-Turkish Literary\n Machine Translation","summary":" Although machine translation systems are mostly designed to serve in the\ngeneral domain, there is a growing tendency to adapt these systems to other\ndomains like literary translation. In this paper, we focus on English-Turkish\nliterary translation and develop machine translation models that take into\naccount the stylistic features of translators. We fine-tune a pre-trained\nmachine translation model by the manually-aligned works of a particular\ntranslator. We make a detailed analysis of the effects of manual and automatic\nalignments, data augmentation methods, and corpus size on the translations. We\npropose an approach based on stylistic features to evaluate the style of a\ntranslator in the output translations. 
We show that the human translator style\ncan be highly recreated in the target machine translations by adapting the\nmodels to the style of the translator.\n","authors":["Zeynep Yirmibeşoğlu","Olgun Dursun","Harun Dallı","Mehmet Şahin","Ena Hodzik","Sabri Gürses","Tunga Güngör"],"pdf_url":"https://arxiv.org/pdf/2307.11457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11450v1","updated":"2023-07-21T09:30:46Z","published":"2023-07-21T09:30:46Z","title":"Topic Identification For Spontaneous Speech: Enriching Audio Features\n With Embedded Linguistic Information","summary":" Traditional topic identification solutions from audio rely on an automatic\nspeech recognition system (ASR) to produce transcripts used as input to a\ntext-based model. These approaches work well in high-resource scenarios, where\nthere are sufficient data to train both components of the pipeline. However, in\nlow-resource situations, the ASR system, even if available, produces\nlow-quality transcripts, leading to a bad text-based classifier. Moreover,\nspontaneous speech containing hesitations can further degrade the performance\nof the ASR model. In this paper, we investigate alternatives to the standard\ntext-only solutions by comparing audio-only and hybrid techniques of jointly\nutilising text and audio features. The models evaluated on spontaneous Finnish\nspeech demonstrate that purely audio-based solutions are a viable option when\nASR components are not available, while the hybrid multi-modal solutions\nachieve the best results.\n","authors":["Dejan Porjazovski","Tamás Grósz","Mikko Kurimo"],"pdf_url":"https://arxiv.org/pdf/2307.11450v1.pdf","comment":"Accepted to EUSIPCO 2023"},{"id":"http://arxiv.org/abs/2306.14096v3","updated":"2023-07-21T08:57:38Z","published":"2023-06-25T02:24:30Z","title":"Chinese Fine-Grained Financial Sentiment Analysis with Large Language\n Models","summary":" Entity-level fine-grained sentiment analysis in the financial domain is a\ncrucial subtask of sentiment analysis and currently faces numerous challenges.\nThe primary challenge stems from the lack of high-quality and large-scale\nannotated corpora specifically designed for financial text sentiment analysis,\nwhich in turn limits the availability of data necessary for developing\neffective text processing techniques. Recent advancements in large language\nmodels (LLMs) have yielded remarkable performance in natural language\nprocessing tasks, primarily centered around language pattern matching. In this\npaper, we propose a novel and extensive Chinese fine-grained financial\nsentiment analysis dataset, FinChina SA, for enterprise early warning. We\nthoroughly evaluate and experiment with well-known existing open-source LLMs\nusing our dataset. We firmly believe that our dataset will serve as a valuable\nresource to advance the exploration of real-world financial sentiment analysis\ntasks, which should be the focus of future research. 
Our dataset and all code\nto replicate the experimental results will be released.\n","authors":["Yinyu Lan","Yanru Wu","Wang Xu","Weiqiang Feng","Youhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.14096v3.pdf","comment":"FinLLM Symposium at IJCAI 2023"},{"id":"http://arxiv.org/abs/2306.02250v2","updated":"2023-07-21T07:46:03Z","published":"2023-06-04T03:46:45Z","title":"Large Language Model Augmented Narrative Driven Recommendations","summary":" Narrative-driven recommendation (NDR) presents an information access problem\nwhere users solicit recommendations with verbose descriptions of their\npreferences and context, for example, travelers soliciting recommendations for\npoints of interest while describing their likes/dislikes and travel\ncircumstances. These requests are increasingly important with the rise of\nnatural language-based conversational interfaces for search and recommendation\nsystems. However, NDR lacks abundant training data for models, and current\nplatforms commonly do not support these requests. Fortunately, classical\nuser-item interaction datasets contain rich textual data, e.g., reviews, which\noften describe user preferences and context - this may be used to bootstrap\ntraining for NDR models. In this work, we explore using large language models\n(LLMs) for data augmentation to train NDR models. We use LLMs for authoring\nsynthetic narrative queries from user-item interactions with few-shot prompting\nand train retrieval models for NDR on synthetic queries and user-item\ninteraction data. Our experiments demonstrate that this is an effective\nstrategy for training small-parameter retrieval models that outperform other\nretrieval and LLM baselines for narrative-driven recommendation.\n","authors":["Sheshera Mysore","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2306.02250v2.pdf","comment":"RecSys 2023 Camera-ready"},{"id":"http://arxiv.org/abs/2304.04250v2","updated":"2023-07-21T07:39:58Z","published":"2023-04-09T14:52:18Z","title":"Editable User Profiles for Controllable Text Recommendation","summary":" Methods for making high-quality recommendations often rely on learning latent\nrepresentations from interaction data. These methods, while performant, do not\nprovide ready mechanisms for users to control the recommendation they receive.\nOur work tackles this problem by proposing LACE, a novel concept value\nbottleneck model for controllable text recommendations. LACE represents each\nuser with a succinct set of human-readable concepts through retrieval given\nuser-interacted documents and learns personalized representations of the\nconcepts based on user documents. This concept based user profile is then\nleveraged to make recommendations. The design of our model affords control over\nthe recommendations through a number of intuitive interactions with a\ntransparent user profile. We first establish the quality of recommendations\nobtained from LACE in an offline evaluation on three recommendation tasks\nspanning six datasets in warm-start, cold-start, and zero-shot setups. 
Next, we\nvalidate the controllability of LACE under simulated user interactions.\nFinally, we implement LACE in an interactive controllable recommender system\nand conduct a user study to demonstrate that users are able to improve the\nquality of recommendations they receive through interactions with an editable\nuser profile.\n","authors":["Sheshera Mysore","Mahmood Jasim","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2304.04250v2.pdf","comment":"SIGIR-2023 Camera Ready"},{"id":"http://arxiv.org/abs/2307.11394v1","updated":"2023-07-21T07:22:18Z","published":"2023-07-21T07:22:18Z","title":"MeetEval: A Toolkit for Computation of Word Error Rates for Meeting\n Transcription Systems","summary":" MeetEval is an open-source toolkit to evaluate all kinds of meeting\ntranscription systems. It provides a unified interface for the computation of\ncommonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER\nalong other WER definitions. We extend the cpWER computation by a temporal\nconstraint to ensure that only words are identified as correct when the\ntemporal alignment is plausible. This leads to a better quality of the matching\nof the hypothesis string to the reference string that more closely resembles\nthe actual transcription quality, and a system is penalized if it provides poor\ntime annotations. Since word-level timing information is often not available,\nwe present a way to approximate exact word-level timings from segment-level\ntimings (e.g., a sentence) and show that the approximation leads to a similar\nWER as a matching with exact word-level annotations. At the same time, the time\nconstraint leads to a speedup of the matching algorithm, which outweighs the\nadditional overhead caused by processing the time stamps.\n","authors":["Thilo von Neumann","Christoph Boeddeker","Marc Delcroix","Reinhold Haeb-Umbach"],"pdf_url":"https://arxiv.org/pdf/2307.11394v1.pdf","comment":"Accepted for presentation at the Chime7 workshop 2023"},{"id":"http://arxiv.org/abs/2306.17519v2","updated":"2023-07-21T06:57:49Z","published":"2023-06-30T10:12:30Z","title":"GPT-FinRE: In-context Learning for Financial Relation Extraction using\n Large Language Models","summary":" Relation extraction (RE) is a crucial task in natural language processing\n(NLP) that aims to identify and classify relationships between entities\nmentioned in text. In the financial domain, relation extraction plays a vital\nrole in extracting valuable information from financial documents, such as news\narticles, earnings reports, and company filings. This paper describes our\nsolution to relation extraction on one such dataset REFinD. The dataset was\nreleased along with shared task as a part of the Fourth Workshop on Knowledge\nDiscovery from Unstructured Data in Financial Services, co-located with SIGIR\n2023. In this paper, we employed OpenAI models under the framework of\nin-context learning (ICL). We utilized two retrieval strategies to find top K\nrelevant in-context learning demonstrations / examples from training data for a\ngiven test example. The first retrieval mechanism, we employed, is a\nlearning-free dense retriever and the other system is a learning-based\nretriever. We were able to achieve 3rd rank overall. 
Our best F1-score is\n0.718.\n","authors":["Pawan Kumar Rajpoot","Ankur Parikh"],"pdf_url":"https://arxiv.org/pdf/2306.17519v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2305.02105 by other authors"},{"id":"http://arxiv.org/abs/2307.11380v1","updated":"2023-07-21T06:38:37Z","published":"2023-07-21T06:38:37Z","title":"Is ChatGPT Involved in Texts? Measure the Polish Ratio to Detect\n ChatGPT-Generated Text","summary":" The remarkable capabilities of large-scale language models, such as ChatGPT,\nin text generation have incited awe and spurred researchers to devise detectors\nto mitigate potential risks, including misinformation, phishing, and academic\ndishonesty. Despite this, most previous studies, including HC3, have been\npredominantly geared towards creating detectors that differentiate between\npurely ChatGPT-generated texts and human-authored texts. This approach,\nhowever, fails to work on discerning texts generated through human-machine\ncollaboration, such as ChatGPT-polished texts. Addressing this gap, we\nintroduce a novel dataset termed HPPT (ChatGPT-polished academic abstracts),\nfacilitating the construction of more robust detectors. It diverges from extant\ncorpora by comprising pairs of human-written and ChatGPT-polished abstracts\ninstead of purely ChatGPT-generated texts. Additionally, we propose the \"Polish\nRatio\" method, an innovative measure of ChatGPT's involvement in text\ngeneration based on editing distance. It provides a mechanism to measure the\ndegree of human originality in the resulting text. Our experimental results\nshow our proposed model has better robustness on the HPPT dataset and two\nexisting datasets (HC3 and CDB). Furthermore, the \"Polish Ratio\" we proposed\noffers a more comprehensive explanation by quantifying the degree of ChatGPT\ninvolvement, which indicates that a Polish Ratio value greater than 0.2\nsignifies ChatGPT involvement and a value exceeding 0.6 implies that ChatGPT\ngenerates most of the text.\n","authors":["Lingyi Yang","Feng Jiang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2307.11380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11346v1","updated":"2023-07-21T04:43:00Z","published":"2023-07-21T04:43:00Z","title":"CohortGPT: An Enhanced GPT for Participant Recruitment in Clinical Study","summary":" Participant recruitment based on unstructured medical texts such as clinical\nnotes and radiology reports has been a challenging yet important task for the\ncohort establishment in clinical research. Recently, Large Language Models\n(LLMs) such as ChatGPT have achieved tremendous success in various downstream\ntasks thanks to their promising performance in language understanding,\ninference, and generation. It is then natural to test their feasibility in\nsolving the cohort recruitment task, which involves the classification of a\ngiven paragraph of medical text into disease label(s). However, when applied to\nknowledge-intensive problem settings such as medical text classification, where\nthe LLMs are expected to understand the decision made by human experts and\naccurately identify the implied disease labels, the LLMs show a mediocre\nperformance. A possible explanation is that, by only using the medical text,\nthe LLMs neglect to use the rich context of additional information that\nlanguages afford. To this end, we propose to use a knowledge graph as auxiliary\ninformation to guide the LLMs in making predictions. 
Moreover, to further boost\nthe LLMs adapt to the problem setting, we apply a chain-of-thought (CoT) sample\nselection strategy enhanced by reinforcement learning, which selects a set of\nCoT samples given each individual medical report. Experimental results and\nvarious ablation studies show that our few-shot learning method achieves\nsatisfactory performance compared with fine-tuning strategies and gains superb\nadvantages when the available data is limited. The code and sample dataset of\nthe proposed CohortGPT model is available at:\nhttps://anonymous.4open.science/r/CohortGPT-4872/\n","authors":["Zihan Guan","Zihao Wu","Zhengliang Liu","Dufan Wu","Hui Ren","Quanzheng Li","Xiang Li","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.11346v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.11344v1","updated":"2023-07-21T04:22:43Z","published":"2023-07-21T04:22:43Z","title":"DEFTri: A Few-Shot Label Fused Contextual Representation Learning For\n Product Defect Triage in e-Commerce","summary":" Defect Triage is a time-sensitive and critical process in a large-scale agile\nsoftware development lifecycle for e-commerce. Inefficiencies arising from\nhuman and process dependencies in this domain have motivated research in\nautomated approaches using machine learning to accurately assign defects to\nqualified teams. This work proposes a novel framework for automated defect\ntriage (DEFTri) using fine-tuned state-of-the-art pre-trained BERT on labels\nfused text embeddings to improve contextual representations from\nhuman-generated product defects. For our multi-label text classification defect\ntriage task, we also introduce a Walmart proprietary dataset of product defects\nusing weak supervision and adversarial learning, in a few-shot setting.\n","authors":["Ipsita Mohanty"],"pdf_url":"https://arxiv.org/pdf/2307.11344v1.pdf","comment":"In Proceedings of the Fifth Workshop on e-Commerce and NLP ECNLP 5\n 2022 Pages 1-7"},{"id":"http://arxiv.org/abs/2307.11316v1","updated":"2023-07-21T02:51:41Z","published":"2023-07-21T02:51:41Z","title":"Making Pre-trained Language Models both Task-solvers and\n Self-calibrators","summary":" Pre-trained language models (PLMs) serve as backbones for various real-world\nsystems. For high-stake applications, it's equally essential to have reasonable\nconfidence estimations in predictions. While the vanilla confidence scores of\nPLMs can already be effectively utilized, PLMs consistently become\noverconfident in their wrong predictions, which is not desirable in practice.\nPrevious work shows that introducing an extra calibration task can mitigate\nthis issue. The basic idea involves acquiring additional data to train models\nin predicting the confidence of their initial predictions. However, it only\ndemonstrates the feasibility of this kind of method, assuming that there are\nabundant extra available samples for the introduced calibration task. In this\nwork, we consider the practical scenario that we need to effectively utilize\ntraining samples to make PLMs both task-solvers and self-calibrators. Three\nchallenges are presented, including limited training samples, data imbalance,\nand distribution shifts. We first conduct pilot experiments to quantify various\ndecisive factors in the calibration task. 
Based on the empirical analysis\nresults, we propose a training algorithm LM-TOAST to tackle the challenges.\nExperimental results show that LM-TOAST can effectively utilize the training\ndata to make PLMs have reasonable confidence estimations while maintaining the\noriginal task performance. Further, we consider three downstream applications,\nnamely selective classification, adversarial defense, and model cascading, to\nshow the practical usefulness of LM-TOAST. The code will be made public at\n\\url{https://github.com/Yangyi-Chen/LM-TOAST}.\n","authors":["Yangyi Chen","Xingyao Wang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2307.11316v1.pdf","comment":"Accepted to Findings of ACL 2023"},{"id":"http://arxiv.org/abs/2307.11315v1","updated":"2023-07-21T02:47:18Z","published":"2023-07-21T02:47:18Z","title":"Generating Image-Specific Text Improves Fine-grained Image\n Classification","summary":" Recent vision-language models outperform vision-only models on many image\nclassification tasks. However, because of the absence of paired text/image\ndescriptions, it remains difficult to fine-tune these models for fine-grained\nimage classification. In this work, we propose a method, GIST, for generating\nimage-specific fine-grained text descriptions from image-only datasets, and\nshow that these text descriptions can be used to improve classification. Key\nparts of our method include 1. prompting a pretrained large language model with\ndomain-specific prompts to generate diverse fine-grained text descriptions for\neach class and 2. using a pretrained vision-language model to match each image\nto label-preserving text descriptions that capture relevant visual features in\nthe image. We demonstrate the utility of GIST by fine-tuning vision-language\nmodels on the image-and-generated-text pairs to learn an aligned\nvision-language representation space for improved classification. We evaluate\nour learned representation space in full-shot and few-shot scenarios across\nfour diverse fine-grained classification datasets, each from a different\ndomain. Our method achieves an average improvement of $4.1\\%$ in accuracy over\nCLIP linear probes and an average of $1.1\\%$ improvement in accuracy over the\nprevious state-of-the-art image-text classification method on the full-shot\ndatasets. Our method achieves similar improvements across few-shot regimes.\nCode is available at https://github.com/emu1729/GIST.\n","authors":["Emily Mu","Kathleen M. Lewis","Adrian V. Dalca","John Guttag"],"pdf_url":"https://arxiv.org/pdf/2307.11315v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2307.10291v2","updated":"2023-07-21T02:34:58Z","published":"2023-07-18T14:30:36Z","title":"Mutual Reinforcement Effects in Japanese Sentence Classification and\n Named Entity Recognition Tasks","summary":" Information extraction(IE) is a crucial subfield within natural language\nprocessing. However, for the traditionally segmented approach to sentence\nclassification and Named Entity Recognition, the intricate interactions between\nthese individual subtasks remain largely uninvestigated. In this study, we\npropose an integrative analysis, converging sentence classification with Named\nEntity Recognition, with the objective to unveil and comprehend the mutual\nreinforcement effect within these two information extraction subtasks. 
To\nachieve this, we introduce a Sentence Classification and Named Entity\nRecognition Multi-task (SCNM) approach that combines Sentence Classification\n(SC) and Named Entity Recognition (NER). We develop a Sentence-to-Label\nGeneration (SLG) framework for SCNM and construct a Wikipedia dataset\ncontaining both SC and NER. Using a format converter, we unify input formats\nand employ a generative model to generate SC-labels, NER-labels, and associated\ntext segments. We propose a Constraint Mechanism (CM) to improve generated\nformat accuracy. Our results show SC accuracy increased by 1.13 points and NER\nby 1.06 points in SCNM compared to standalone tasks, with CM raising format\naccuracy from 63.61 to 100. The findings indicate mutual reinforcement effects\nbetween SC and NER, and integration enhances both tasks' performance. We\nadditionally implemented the SLG framework on single SC task. It yielded\nsuperior accuracies compared to the baseline on two distinct Japanese SC\ndatasets. Notably, in the experiment of few-shot learning, SLG framework shows\nmuch better performance than fine-tune method. These empirical findings\ncontribute additional evidence to affirm the efficacy of the SLG framework.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2307.10291v2.pdf","comment":"25 pages, 12 figures, 19 tables. arXiv admin note: substantial text\n overlap with arXiv:2306.15978"},{"id":"http://arxiv.org/abs/2307.10432v2","updated":"2023-07-21T02:22:14Z","published":"2023-07-19T19:40:34Z","title":"PharmacyGPT: The AI Pharmacist","summary":" In this study, we introduce PharmacyGPT, a novel framework to assess the\ncapabilities of large language models (LLMs) such as ChatGPT and GPT-4 in\nemulating the role of clinical pharmacists. Our methodology encompasses the\nutilization of LLMs to generate comprehensible patient clusters, formulate\nmedication plans, and forecast patient outcomes. We conduct our investigation\nusing real data acquired from the intensive care unit (ICU) at the University\nof North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable\ninsights into the potential applications and limitations of LLMs in the field\nof clinical pharmacy, with implications for both patient care and the\ndevelopment of future AI-driven healthcare solutions. By evaluating the\nperformance of PharmacyGPT, we aim to contribute to the ongoing discourse\nsurrounding the integration of artificial intelligence in healthcare settings,\nultimately promoting the responsible and efficacious use of such technologies.\n","authors":["Zhengliang Liu","Zihao Wu","Mengxuan Hu","Bokai Zhao","Lin Zhao","Tianyi Zhang","Haixing Dai","Xianyan Chen","Ye Shen","Sheng Li","Brian Murray","Tianming Liu","Andrea Sikora"],"pdf_url":"https://arxiv.org/pdf/2307.10432v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13971v2","updated":"2023-07-21T01:58:13Z","published":"2023-06-24T13:57:32Z","title":"Towards Robust Aspect-based Sentiment Analysis through\n Non-counterfactual Augmentations","summary":" While state-of-the-art NLP models have demonstrated excellent performance for\naspect based sentiment analysis (ABSA), substantial evidence has been presented\non their lack of robustness. This is especially manifested as significant\ndegradation in performance when faced with out-of-distribution data. 
Recent\nsolutions that rely on counterfactually augmented datasets show promising\nresults, but they are inherently limited because of the lack of access to\nexplicit causal structure. In this paper, we present an alternative approach\nthat relies on non-counterfactual data augmentation. Our proposal instead\nrelies on using noisy, cost-efficient data augmentations that preserve\nsemantics associated with the target aspect. Our approach then relies on\nmodelling invariances between different versions of the data to improve\nrobustness. A comprehensive suite of experiments shows that our proposal\nsignificantly improves upon strong pre-trained baselines on both standard and\nrobustness-specific datasets. Our approach further establishes a new\nstate-of-the-art on the ABSA robustness benchmark and transfers well across\ndomains.\n","authors":["Xinyu Liu","Yan Ding","Kaikai An","Chunyang Xiao","Pranava Madhyastha","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2306.13971v2.pdf","comment":"10pages,1 figure,10 tables"},{"id":"http://arxiv.org/abs/2307.11278v1","updated":"2023-07-21T00:34:38Z","published":"2023-07-21T00:34:38Z","title":"Generator-Retriever-Generator: A Novel Approach to Open-domain Question\n Answering","summary":" Open-domain question answering (QA) tasks usually require the retrieval of\nrelevant information from a large corpus to generate accurate answers. We\npropose a novel approach called Generator-Retriever-Generator (GRG) that\ncombines document retrieval techniques with a large language model (LLM), by\nfirst prompting the model to generate contextual documents based on a given\nquestion. In parallel, a dual-encoder network retrieves documents that are\nrelevant to the question from an external corpus. The generated and retrieved\ndocuments are then passed to the second LLM, which generates the final answer.\nBy combining document retrieval and LLM generation, our approach addresses the\nchallenges of open-domain QA, such as generating informative and contextually\nrelevant answers. GRG outperforms the state-of-the-art generate-then-read and\nretrieve-then-read pipelines (GENREAD and RFiD) improving their performance at\nleast by +5.2, +4.2, and +1.6 on TriviaQA, NQ, and WebQ datasets, respectively.\nWe provide code, datasets, and checkpoints\n\\footnote{\\url{https://github.com/abdoelsayed2016/GRG}}\n","authors":["Abdelrahman Abdallah","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2307.11278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08584v3","updated":"2023-07-21T22:08:45Z","published":"2022-11-15T23:57:34Z","title":"Toward expanding the scope of radiology report summarization to multiple\n anatomies and modalities","summary":" Radiology report summarization (RRS) is a growing area of research. Given the\nFindings section of a radiology report, the goal is to generate a summary\n(called an Impression section) that highlights the key observations and\nconclusions of the radiology study. However, RRS currently faces essential\nlimitations.First, many prior studies conduct experiments on private datasets,\npreventing reproduction of results and fair comparisons across different\nsystems and solutions. Second, most prior approaches are evaluated solely on\nchest X-rays. To address these limitations, we propose a dataset (MIMIC-RRS)\ninvolving three new modalities and seven new anatomies based on the MIMIC-III\nand MIMIC-CXR datasets. 
We then conduct extensive experiments to evaluate the\nperformance of models both within and across modality-anatomy pairs in\nMIMIC-RRS. In addition, we evaluate their clinical efficacy via RadGraph, a\nfactual correctness metric.\n","authors":["Zhihong Chen","Maya Varma","Xiang Wan","Curtis Langlotz","Jean-Benoit Delbrouck"],"pdf_url":"https://arxiv.org/pdf/2211.08584v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11922v1","updated":"2023-07-21T22:02:50Z","published":"2023-07-21T22:02:50Z","title":"Selective Perception: Optimizing State Descriptions with Reinforcement\n Learning for Language Model Actors","summary":" Large language models (LLMs) are being applied as actors for sequential\ndecision making tasks in domains such as robotics and games, utilizing their\ngeneral world knowledge and planning abilities. However, previous work does\nlittle to explore what environment state information is provided to LLM actors\nvia language. Exhaustively describing high-dimensional states can impair\nperformance and raise inference costs for LLM actors. Previous LLM actors avoid\nthe issue by relying on hand-engineered, task-specific protocols to determine\nwhich features to communicate about a state and which to leave out. In this\nwork, we propose Brief Language INputs for DEcision-making Responses (BLINDER),\na method for automatically selecting concise state descriptions by learning a\nvalue function for task-conditioned state descriptions. We evaluate BLINDER on\nthe challenging video game NetHack and a robotic manipulation task. Our method\nimproves task success rate, reduces input size and compute costs, and\ngeneralizes between LLM actors.\n","authors":["Kolby Nottingham","Yasaman Razeghi","Kyungmin Kim","JB Lanier","Pierre Baldi","Roy Fox","Sameer Singh"],"pdf_url":"https://arxiv.org/pdf/2307.11922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03341v3","updated":"2023-07-21T19:11:58Z","published":"2023-06-06T01:26:53Z","title":"Inference-Time Intervention: Eliciting Truthful Answers from a Language\n Model","summary":" We introduce Inference-Time Intervention (ITI), a technique designed to\nenhance the truthfulness of large language models (LLMs). ITI operates by\nshifting model activations during inference, following a set of directions\nacross a limited number of attention heads. This intervention significantly\nimproves the performance of LLaMA models on the TruthfulQA benchmark. On an\ninstruction-finetuned LLaMA called Alpaca, ITI improves its truthfulness from\n32.5% to 65.1%. We identify a tradeoff between truthfulness and helpfulness and\ndemonstrate how to balance it by tuning the intervention strength. ITI is\nminimally invasive and computationally inexpensive. Moreover, the technique is\ndata efficient: while approaches like RLHF require extensive annotations, ITI\nlocates truthful directions using only few hundred examples. 
Our findings\nsuggest that LLMs may have an internal representation of the likelihood of\nsomething being true, even as they produce falsehoods on the surface.\n","authors":["Kenneth Li","Oam Patel","Fernanda Viégas","Hanspeter Pfister","Martin Wattenberg"],"pdf_url":"https://arxiv.org/pdf/2306.03341v3.pdf","comment":"code: https://github.com/likenneth/honest_llama"},{"id":"http://arxiv.org/abs/2307.11865v1","updated":"2023-07-21T19:09:37Z","published":"2023-07-21T19:09:37Z","title":"CARTIER: Cartographic lAnguage Reasoning Targeted at Instruction\n Execution for Robots","summary":" This work explores the capacity of large language models (LLMs) to address\nproblems at the intersection of spatial planning and natural language\ninterfaces for navigation.Our focus is on following relatively complex\ninstructions that are more akin to natural conversation than traditional\nexplicit procedural directives seen in robotics. Unlike most prior work, where\nnavigation directives are provided as imperative commands (e.g., go to the\nfridge), we examine implicit directives within conversational interactions. We\nleverage the 3D simulator AI2Thor to create complex and repeatable scenarios at\nscale, and augment it by adding complex language queries for 40 object types.\nWe demonstrate that a robot can better parse descriptive language queries than\nexisting methods by using an LLM to interpret the user interaction in the\ncontext of a list of the objects in the scene.\n","authors":["Nikhil Kakodkar","Dmitriy Rivkin","Bobak H. Baghi","Francois Hogan","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2307.11865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11864v1","updated":"2023-07-21T19:09:24Z","published":"2023-07-21T19:09:24Z","title":"The Looming Threat of Fake and LLM-generated LinkedIn Profiles:\n Challenges and Opportunities for Detection and Prevention","summary":" In this paper, we present a novel method for detecting fake and Large\nLanguage Model (LLM)-generated profiles in the LinkedIn Online Social Network\nimmediately upon registration and before establishing connections. Early fake\nprofile identification is crucial to maintaining the platform's integrity since\nit prevents imposters from acquiring the private and sensitive information of\nlegitimate users and from gaining an opportunity to increase their credibility\nfor future phishing and scamming activities. This work uses textual information\nprovided in LinkedIn profiles and introduces the Section and Subsection Tag\nEmbedding (SSTE) method to enhance the discriminative characteristics of these\ndata for distinguishing between legitimate profiles and those created by\nimposters manually or by using an LLM. Additionally, the dearth of a large\npublicly available LinkedIn dataset motivated us to collect 3600 LinkedIn\nprofiles for our research. We will release our dataset publicly for research\npurposes. This is, to the best of our knowledge, the first large publicly\navailable LinkedIn dataset for fake LinkedIn account detection. Within our\nparadigm, we assess static and contextualized word embeddings, including GloVe,\nFlair, BERT, and RoBERTa. We show that the suggested method can distinguish\nbetween legitimate and fake profiles with an accuracy of about 95% across all\nword embeddings. 
In addition, we show that SSTE has a promising accuracy for\nidentifying LLM-generated profiles, despite the fact that no LLM-generated\nprofiles were employed during the training phase, and can achieve an accuracy\nof approximately 90% when only 20 LLM-generated profiles are added to the\ntraining set. It is a significant finding since the proliferation of several\nLLMs in the near future makes it extremely challenging to design a single\nsystem that can identify profiles created with various LLMs.\n","authors":["Navid Ayoobi","Sadat Shahriar","Arjun Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2307.11864v1.pdf","comment":"33rd ACM Conference on Hypertext and Social Media (HT '23)"},{"id":"http://arxiv.org/abs/2307.11848v1","updated":"2023-07-21T18:35:24Z","published":"2023-07-21T18:35:24Z","title":"MythQA: Query-Based Large-Scale Check-Worthy Claim Detection through\n Multi-Answer Open-Domain Question Answering","summary":" Check-worthy claim detection aims at providing plausible misinformation to\ndownstream fact-checking systems or human experts to check. This is a crucial\nstep toward accelerating the fact-checking process. Many efforts have been put\ninto how to identify check-worthy claims from a small scale of pre-collected\nclaims, but how to efficiently detect check-worthy claims directly from a\nlarge-scale information source, such as Twitter, remains underexplored. To fill\nthis gap, we introduce MythQA, a new multi-answer open-domain question\nanswering(QA) task that involves contradictory stance mining for query-based\nlarge-scale check-worthy claim detection. The idea behind this is that\ncontradictory claims are a strong indicator of misinformation that merits\nscrutiny by the appropriate authorities. To study this task, we construct\nTweetMythQA, an evaluation dataset containing 522 factoid multi-answer\nquestions based on controversial topics. Each question is annotated with\nmultiple answers. Moreover, we collect relevant tweets for each distinct\nanswer, then classify them into three categories: \"Supporting\", \"Refuting\", and\n\"Neutral\". In total, we annotated 5.3K tweets. Contradictory evidence is\ncollected for all answers in the dataset. Finally, we present a baseline system\nfor MythQA and evaluate existing NLP models for each system component using the\nTweetMythQA dataset. We provide initial benchmarks and identify key challenges\nfor future models to improve upon. Code and data are available at:\nhttps://github.com/TonyBY/Myth-QA\n","authors":["Yang Bai","Anthony Colas","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11848v1.pdf","comment":"Accepted by SIGIR 2023"},{"id":"http://arxiv.org/abs/2307.11845v1","updated":"2023-07-21T18:29:04Z","published":"2023-07-21T18:29:04Z","title":"Multimodal Document Analytics for Banking Process Automation","summary":" In response to growing FinTech competition and the need for improved\noperational efficiency, this research focuses on understanding the potential of\nadvanced document analytics, particularly using multimodal models, in banking\nprocesses. We perform a comprehensive analysis of the diverse banking document\nlandscape, highlighting the opportunities for efficiency gains through\nautomation and advanced analytics techniques in the customer business. Building\non the rapidly evolving field of natural language processing (NLP), we\nillustrate the potential of models such as LayoutXLM, a cross-lingual,\nmultimodal, pre-trained model, for analyzing diverse documents in the banking\nsector. 
This model performs a text token classification on German company\nregister extracts with an overall F1 score performance of around 80\\%. Our\nempirical evidence confirms the critical role of layout information in\nimproving model performance and further underscores the benefits of integrating\nimage information. Interestingly, our study shows that over 75% F1 score can be\nachieved with only 30% of the training data, demonstrating the efficiency of\nLayoutXLM. Through addressing state-of-the-art document analysis frameworks,\nour study aims to enhance process efficiency and demonstrate the real-world\napplicability and benefits of multimodal models within banking.\n","authors":["Christopher Gerling","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2307.11845v1.pdf","comment":"A Preprint"},{"id":"http://arxiv.org/abs/2307.11795v1","updated":"2023-07-21T08:39:15Z","published":"2023-07-21T08:39:15Z","title":"Prompting Large Language Models with Speech Recognition Abilities","summary":" Large language models have proven themselves highly flexible, able to solve a\nwide range of generative tasks, such as abstractive summarization and\nopen-ended question answering. In this paper we extend the capabilities of LLMs\nby directly attaching a small audio encoder allowing it to perform speech\nrecognition. By directly prepending a sequence of audial embeddings to the text\ntoken embeddings, the LLM can be converted to an automatic speech recognition\n(ASR) system, and be used in the exact same manner as its textual counterpart.\nExperiments on Multilingual LibriSpeech (MLS) show that incorporating a\nconformer encoder into the open sourced LLaMA-7B allows it to outperform\nmonolingual baselines by 18% and perform multilingual speech recognition\ndespite LLaMA being trained overwhelmingly on English text. Furthermore, we\nperform ablation studies to investigate whether the LLM can be completely\nfrozen during training to maintain its original capabilities, scaling up the\naudio encoder, and increasing the audio encoder striding to generate fewer\nembeddings. The results from these studies show that multilingual ASR is\npossible even when the LLM is frozen or when strides of almost 1 second are\nused in the audio encoder opening up the possibility for LLMs to operate on\nlong-form audio.\n","authors":["Yassir Fathullah","Chunyang Wu","Egor Lakomkin","Junteng Jia","Yuan Shangguan","Ke Li","Jinxi Guo","Wenhan Xiong","Jay Mahadeokar","Ozlem Kalinli","Christian Fuegen","Mike Seltzer"],"pdf_url":"https://arxiv.org/pdf/2307.11795v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.11748v1","updated":"2023-07-21T17:58:47Z","published":"2023-07-21T17:58:47Z","title":"BandRe: Rethinking Band-Pass Filters for Scale-Wise Object Detection\n Evaluation","summary":" Scale-wise evaluation of object detectors is important for real-world\napplications. However, existing metrics are either coarse or not sufficiently\nreliable. In this paper, we propose novel scale-wise metrics that strike a\nbalance between fineness and reliability, using a filter bank consisting of\ntriangular and trapezoidal band-pass filters. We conduct experiments with two\nmethods on two datasets and show that the proposed metrics can highlight the\ndifferences between the methods and between the datasets. 
Code is available at\nhttps://github.com/shinya7y/UniverseNet .\n","authors":["Yosuke Shinya"],"pdf_url":"https://arxiv.org/pdf/2307.11748v1.pdf","comment":"Honorable Mention Solution Award in Small Object Detection Challenge\n for Spotting Birds, International Conference on Machine Vision Applications\n (MVA) 2023"},{"id":"http://arxiv.org/abs/2108.02226v2","updated":"2023-07-21T17:27:10Z","published":"2021-08-04T18:08:28Z","title":"Terabyte-scale supervised 3D training and benchmarking dataset of the\n mouse kidney","summary":" The performance of machine learning algorithms, when used for segmenting 3D\nbiomedical images, does not reach the level expected based on results achieved\nwith 2D photos. This may be explained by the comparative lack of high-volume,\nhigh-quality training datasets, which require state-of-the-art imaging\nfacilities, domain experts for annotation and large computational and personal\nresources. The HR-Kidney dataset presented in this work bridges this gap by\nproviding 1.7 TB of artefact-corrected synchrotron radiation-based X-ray\nphase-contrast microtomography images of whole mouse kidneys and validated\nsegmentations of 33 729 glomeruli, which corresponds to a one to two orders of\nmagnitude increase over currently available biomedical datasets. The image sets\nalso contain the underlying raw data, threshold- and morphology-based\nsemi-automatic segmentations of renal vasculature and uriniferous tubules, as\nwell as true 3D manual annotations. We therewith provide a broad basis for the\nscientific community to build upon and expand in the fields of image\nprocessing, data augmentation and machine learning, in particular unsupervised\nand semi-supervised learning investigations, as well as transfer learning and\ngenerative adversarial networks.\n","authors":["Willy Kuo","Diego Rossinelli","Georg Schulz","Roland H. Wenger","Simone Hieber","Bert Müller","Vartan Kurtcuoglu"],"pdf_url":"https://arxiv.org/pdf/2108.02226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10656v2","updated":"2023-07-21T17:15:24Z","published":"2023-03-19T13:41:59Z","title":"More From Less: Self-Supervised Knowledge Distillation for Routine\n Histopathology Data","summary":" Medical imaging technologies are generating increasingly large amounts of\nhigh-quality, information-dense data. Despite the progress, practical use of\nadvanced imaging technologies for research and diagnosis remains limited by\ncost and availability, so information-sparse data such as H&E stains are relied\non in practice. The study of diseased tissue requires methods which can\nleverage these information-dense data to extract more value from routine,\ninformation-sparse data. Using self-supervised deep learning, we demonstrate\nthat it is possible to distil knowledge during training from information-dense\ndata into models which only require information-sparse data for inference. This\nimproves downstream classification accuracy on information-sparse data, making\nit comparable with the fully-supervised baseline. We find substantial effects\non the learned representations, and this training process identifies subtle\nfeatures which otherwise go undetected. 
This approach enables the design of\nmodels which require only routine images, but contain insights from\nstate-of-the-art data, allowing better use of the available resources.\n","authors":["Lucas Farndale","Robert Insall","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2303.10656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11706v1","updated":"2023-07-21T17:02:55Z","published":"2023-07-21T17:02:55Z","title":"3D Skeletonization of Complex Grapevines for Robotic Pruning","summary":" Robotic pruning of dormant grapevines is an area of active research in order\nto promote vine balance and grape quality, but so far robotic efforts have\nlargely focused on planar, simplified vines not representative of commercial\nvineyards. This paper aims to advance the robotic perception capabilities\nnecessary for pruning in denser and more complex vine structures by extending\nplant skeletonization techniques. The proposed pipeline generates skeletal\ngrapevine models that have lower reprojection error and higher connectivity\nthan baseline algorithms. We also show how 3D and skeletal information enables\nprediction accuracy of pruning weight for dense vines surpassing prior work,\nwhere pruning weight is an important vine metric influencing pruning site\nselection.\n","authors":["Eric Schneider","Sushanth Jayanth","Abhisesh Silwal","George Kantor"],"pdf_url":"https://arxiv.org/pdf/2307.11706v1.pdf","comment":"6 pages, IROS 2023 Computer Vision for Automation"},{"id":"http://arxiv.org/abs/2307.11702v1","updated":"2023-07-21T16:56:36Z","published":"2023-07-21T16:56:36Z","title":"SACReg: Scene-Agnostic Coordinate Regression for Visual Localization","summary":" Scene coordinates regression (SCR), i.e., predicting 3D coordinates for every\npixel of a given image, has recently shown promising potential. However,\nexisting methods remain mostly scene-specific or limited to small scenes and\nthus hardly scale to realistic datasets. In this paper, we propose a new\nparadigm where a single generic SCR model is trained once to be then deployed\nto new test scenes, regardless of their scale and without further finetuning.\nFor a given query image, it collects inputs from off-the-shelf image retrieval\ntechniques and Structure-from-Motion databases: a list of relevant database\nimages with sparse pointwise 2D-3D annotations. The model is based on the\ntransformer architecture and can take a variable number of images and sparse\n2D-3D annotations as input. It is trained on a few diverse datasets and\nsignificantly outperforms other scene regression approaches on several\nbenchmarks, including scene-specific models, for visual localization. In\nparticular, we set a new state of the art on the Cambridge localization\nbenchmark, even outperforming feature-matching-based approaches.\n","authors":["Jerome Revaud","Yohann Cabon","Romain Brégier","JongMin Lee","Philippe Weinzaepfel"],"pdf_url":"https://arxiv.org/pdf/2307.11702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06828v3","updated":"2023-07-21T16:54:18Z","published":"2022-11-13T06:03:28Z","title":"Enhancing Few-shot Image Classification with Cosine Transformer","summary":" This paper addresses the few-shot image classification problem, where the\nclassification task is performed on unlabeled query samples given a small\namount of labeled support samples only. One major challenge of the few-shot\nlearning problem is the large variety of object visual appearances that\nprevents the support samples to represent that object comprehensively. 
This\nmight result in a significant difference between support and query samples,\ntherefore undermining the performance of few-shot algorithms. In this paper, we\ntackle the problem by proposing Few-shot Cosine Transformer (FS-CT), where the\nrelational map between supports and queries is effectively obtained for the\nfew-shot tasks. The FS-CT consists of two parts, a learnable prototypical\nembedding network to obtain categorical representations from support samples\nwith hard cases, and a transformer encoder to effectively achieve the\nrelational map from two different support and query samples. We introduce\nCosine Attention, a more robust and stable attention module that enhances the\ntransformer module significantly and therefore improves FS-CT performance from\n5% to over 20% in accuracy compared to the default scaled dot-product\nmechanism. Our method performs competitive results in mini-ImageNet, CUB-200,\nand CIFAR-FS on 1-shot learning and 5-shot learning tasks across backbones and\nfew-shot configurations. We also developed a custom few-shot dataset for Yoga\npose recognition to demonstrate the potential of our algorithm for practical\napplication. Our FS-CT with cosine attention is a lightweight, simple few-shot\nalgorithm that can be applied for a wide range of applications, such as\nhealthcare, medical, and security surveillance. The official implementation\ncode of our Few-shot Cosine Transformer is available at\nhttps://github.com/vinuni-vishc/Few-Shot-Cosine-Transformer\n","authors":["Quang-Huy Nguyen","Cuong Q. Nguyen","Dung D. Le","Hieu H. Pham"],"pdf_url":"https://arxiv.org/pdf/2211.06828v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11661v1","updated":"2023-07-21T15:49:59Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. We will release the code, prompts, and auxiliary text\ndataset upon acceptance.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. 
O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v1.pdf","comment":"10 pages, Pre-print"},{"id":"http://arxiv.org/abs/2307.11654v1","updated":"2023-07-21T15:42:01Z","published":"2023-07-21T15:42:01Z","title":"FEDD -- Fair, Efficient, and Diverse Diffusion-based Lesion Segmentation\n and Malignancy Classification","summary":" Skin diseases affect millions of people worldwide, across all ethnicities.\nIncreasing diagnosis accessibility requires fair and accurate segmentation and\nclassification of dermatology images. However, the scarcity of annotated\nmedical images, especially for rare diseases and underrepresented skin tones,\nposes a challenge to the development of fair and accurate models. In this\nstudy, we introduce a Fair, Efficient, and Diverse Diffusion-based framework\nfor skin lesion segmentation and malignancy classification. FEDD leverages\nsemantically meaningful feature embeddings learned through a denoising\ndiffusion probabilistic backbone and processes them via linear probes to\nachieve state-of-the-art performance on Diverse Dermatology Images (DDI). We\nachieve an improvement in intersection over union of 0.18, 0.13, 0.06, and 0.07\nwhile using only 5%, 10%, 15%, and 20% labeled samples, respectively.\nAdditionally, FEDD trained on 10% of DDI demonstrates malignancy classification\naccuracy of 81%, 14% higher compared to the state-of-the-art. We showcase high\nefficiency in data-constrained scenarios while providing fair performance for\ndiverse skin tones and rare malignancy conditions. Our newly annotated DDI\nsegmentation masks and training code can be found on\nhttps://github.com/hectorcarrion/fedd.\n","authors":["Héctor Carrión","Narges Norouzi"],"pdf_url":"https://arxiv.org/pdf/2307.11654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11643v1","updated":"2023-07-21T15:22:32Z","published":"2023-07-21T15:22:32Z","title":"Morphological Image Analysis and Feature Extraction for Reasoning with\n AI-based Defect Detection and Classification Models","summary":" As the use of artificial intelligent (AI) models becomes more prevalent in\nindustries such as engineering and manufacturing, it is essential that these\nmodels provide transparent reasoning behind their predictions. This paper\nproposes the AI-Reasoner, which extracts the morphological characteristics of\ndefects (DefChars) from images and utilises decision trees to reason with the\nDefChar values. Thereafter, the AI-Reasoner exports visualisations (i.e.\ncharts) and textual explanations to provide insights into outputs made by\nmasked-based defect detection and classification models. It also provides\neffective mitigation strategies to enhance data pre-processing and overall\nmodel performance. The AI-Reasoner was tested on explaining the outputs of an\nIE Mask R-CNN model using a set of 366 images containing defects. The results\ndemonstrated its effectiveness in explaining the IE Mask R-CNN model's\npredictions. 
Overall, the proposed AI-Reasoner provides a solution for\nimproving the performance of AI models in industrial applications that require\ndefect analysis.\n","authors":["Jiajun Zhang","Georgina Cosma","Sarah Bugby","Axel Finke","Jason Watkins"],"pdf_url":"https://arxiv.org/pdf/2307.11643v1.pdf","comment":"8 pages, 3 figures, 5 tables; submitted to 2023 IEEE symposium series\n on computational intelligence (SSCI)"},{"id":"http://arxiv.org/abs/2307.11638v1","updated":"2023-07-21T15:04:21Z","published":"2023-07-21T15:04:21Z","title":"Deep Reinforcement Learning Based System for Intraoperative\n Hyperspectral Video Autofocusing","summary":" Hyperspectral imaging (HSI) captures a greater level of spectral detail than\ntraditional optical imaging, making it a potentially valuable intraoperative\ntool when precise tissue differentiation is essential. Hardware limitations of\ncurrent optical systems used for handheld real-time video HSI result in a\nlimited focal depth, thereby posing usability issues for integration of the\ntechnology into the operating room. This work integrates a focus-tunable liquid\nlens into a video HSI exoscope, and proposes novel video autofocusing methods\nbased on deep reinforcement learning. A first-of-its-kind robotic focal-time\nscan was performed to create a realistic and reproducible testing dataset. We\nbenchmarked our proposed autofocus algorithm against traditional policies, and\nfound our novel approach to perform significantly ($p<0.05$) better than\ntraditional techniques ($0.070\\pm.098$ mean absolute focal error compared to\n$0.146\\pm.148$). In addition, we performed a blinded usability trial by having\ntwo neurosurgeons compare the system with different autofocus policies, and\nfound our novel approach to be the most favourable, making our system a\ndesirable addition for intraoperative HSI.\n","authors":["Charlie Budd","Jianrong Qiu","Oscar MacCormac","Martin Huber","Christopher Mower","Mirek Janatka","Théo Trotouin","Jonathan Shapey","Mads S. Bergholt","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2307.11638v1.pdf","comment":"To be presented at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11636v1","updated":"2023-07-21T14:58:44Z","published":"2023-07-21T14:58:44Z","title":"OxfordTVG-HIC: Can Machine Make Humorous Captions from Images?","summary":" This paper presents OxfordTVG-HIC (Humorous Image Captions), a large-scale\ndataset for humour generation and understanding. Humour is an abstract,\nsubjective, and context-dependent cognitive construct involving several\ncognitive factors, making it a challenging task to generate and interpret.\nHence, humour generation and understanding can serve as a new task for\nevaluating the ability of deep-learning methods to process abstract and\nsubjective information. Due to the scarcity of data, humour-related generation\ntasks such as captioning remain under-explored. To address this gap,\nOxfordTVG-HIC offers approximately 2.9M image-text pairs with humour scores to\ntrain a generalizable humour captioning model. Contrary to existing captioning\ndatasets, OxfordTVG-HIC features a wide range of emotional and semantic\ndiversity resulting in out-of-context examples that are particularly conducive\nto generating humour. Moreover, OxfordTVG-HIC is curated devoid of offensive\ncontent. We also show how OxfordTVG-HIC can be leveraged for evaluating the\nhumour of a generated text. 
Through explainability analysis of the trained\nmodels, we identify the visual and linguistic cues influential for evoking\nhumour prediction (and generation). We observe qualitatively that these cues\nare aligned with the benign violation theory of humour in cognitive psychology.\n","authors":["Runjia Li","Shuyang Sun","Mohamed Elhoseiny","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2307.11636v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.15823v2","updated":"2023-07-21T14:55:21Z","published":"2023-03-28T08:51:15Z","title":"Automated wildlife image classification: An active learning tool for\n ecological applications","summary":" Wildlife camera trap images are being used extensively to investigate animal\nabundance, habitat associations, and behavior, which is complicated by the fact\nthat experts must first classify the images manually. Artificial intelligence\nsystems can take over this task but usually need a large number of\nalready-labeled training images to achieve sufficient performance. This\nrequirement necessitates human expert labor and poses a particular challenge\nfor projects with few cameras or short durations. We propose a label-efficient\nlearning strategy that enables researchers with small or medium-sized image\ndatabases to leverage the potential of modern machine learning, thus freeing\ncrucial resources for subsequent analyses.\n Our methodological proposal is two-fold: (1) We improve current strategies of\ncombining object detection and image classification by tuning the\nhyperparameters of both models. (2) We provide an active learning (AL) system\nthat allows training deep learning models very efficiently in terms of required\nhuman-labeled training images. We supply a software package that enables\nresearchers to use these methods directly and thereby ensure the broad\napplicability of the proposed framework in ecological practice.\n We show that our tuning strategy improves predictive performance. We\ndemonstrate how the AL pipeline reduces the amount of pre-labeled data needed\nto achieve a specific predictive performance and that it is especially valuable\nfor improving out-of-sample predictive performance.\n We conclude that the combination of tuning and AL increases predictive\nperformance substantially. Furthermore, we argue that our work can broadly\nimpact the community through the ready-to-use software package provided.\nFinally, the publication of our models tailored to European wildlife data\nenriches existing model bases mostly trained on data from Africa and North\nAmerica.\n","authors":["Ludwig Bothmann","Lisa Wimmer","Omid Charrakh","Tobias Weber","Hendrik Edelhoff","Wibke Peters","Hien Nguyen","Caryl Benjamin","Annette Menzel"],"pdf_url":"https://arxiv.org/pdf/2303.15823v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03056v3","updated":"2023-07-21T14:45:20Z","published":"2023-03-06T11:59:13Z","title":"MOISST: Multimodal Optimization of Implicit Scene for SpatioTemporal\n calibration","summary":" With the recent advances in autonomous driving and the decreasing cost of\nLiDARs, the use of multimodal sensor systems is on the rise. However, in order\nto make use of the information provided by a variety of complimentary sensors,\nit is necessary to accurately calibrate them. We take advantage of recent\nadvances in computer graphics and implicit volumetric scene representation to\ntackle the problem of multi-sensor spatial and temporal calibration. 
Thanks to\na new formulation of the Neural Radiance Field (NeRF) optimization, we are able\nto jointly optimize calibration parameters along with scene representation\nbased on radiometric and geometric measurements. Our method enables accurate\nand robust calibration from data captured in uncontrolled and unstructured\nurban environments, making our solution more scalable than existing calibration\nsolutions. We demonstrate the accuracy and robustness of our method in urban\nscenes typically encountered in autonomous driving scenarios.\n","authors":["Quentin Herau","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2303.03056v3.pdf","comment":"Accepted at IROS2023 Project site: https://qherau.github.io/MOISST/"},{"id":"http://arxiv.org/abs/2307.11618v1","updated":"2023-07-21T14:37:17Z","published":"2023-07-21T14:37:17Z","title":"Divide and Adapt: Active Domain Adaptation via Customized Learning","summary":" Active domain adaptation (ADA) aims to improve the model adaptation\nperformance by incorporating active learning (AL) techniques to label a\nmaximally-informative subset of target samples. Conventional AL methods do not\nconsider the existence of domain shift, and hence, fail to identify the truly\nvaluable samples in the context of domain adaptation. To accommodate active\nlearning and domain adaption, the two naturally different tasks, in a\ncollaborative framework, we advocate that a customized learning strategy for\nthe target data is the key to the success of ADA solutions. We present\nDivide-and-Adapt (DiaNA), a new ADA framework that partitions the target\ninstances into four categories with stratified transferable properties. With a\nnovel data subdivision protocol based on uncertainty and domainness, DiaNA can\naccurately recognize the most gainful samples. While sending the informative\ninstances for annotation, DiaNA employs tailored learning strategies for the\nremaining categories. Furthermore, we propose an informativeness score that\nunifies the data partitioning criteria. This enables the use of a Gaussian\nmixture model (GMM) to automatically sample unlabeled data into the proposed\nfour categories. Thanks to the \"divideand-adapt\" spirit, DiaNA can handle data\nwith large variations of domain gap. In addition, we show that DiaNA can\ngeneralize to different domain adaptation settings, such as unsupervised domain\nadaptation (UDA), semi-supervised domain adaptation (SSDA), source-free domain\nadaptation (SFDA), etc.\n","authors":["Duojun Huang","Jichang Li","Weikai Chen","Junshi Huang","Zhenhua Chai","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11618v1.pdf","comment":"CVPR2023, Highlight paper"},{"id":"http://arxiv.org/abs/2307.11604v1","updated":"2023-07-21T14:14:29Z","published":"2023-07-21T14:14:29Z","title":"Consistency-guided Meta-Learning for Bootstrapping Semi-Supervised\n Medical Image Segmentation","summary":" Medical imaging has witnessed remarkable progress but usually requires a\nlarge amount of high-quality annotated data which is time-consuming and costly\nto obtain. To alleviate this burden, semi-supervised learning has garnered\nattention as a potential solution. In this paper, we present Meta-Learning for\nBootstrapping Medical Image Segmentation (MLB-Seg), a novel method for tackling\nthe challenge of semi-supervised medical image segmentation. 
Specifically, our\napproach first involves training a segmentation model on a small set of clean\nlabeled images to generate initial labels for unlabeled data. To further\noptimize this bootstrapping process, we introduce a per-pixel weight mapping\nsystem that dynamically assigns weights to both the initialized labels and the\nmodel's own predictions. These weights are determined using a meta-process that\nprioritizes pixels with loss gradient directions closer to those of clean data,\nwhich is based on a small set of precisely annotated images. To facilitate the\nmeta-learning process, we additionally introduce a consistency-based Pseudo\nLabel Enhancement (PLE) scheme that improves the quality of the model's own\npredictions by ensembling predictions from various augmented versions of the\nsame input. In order to improve the quality of the weight maps obtained through\nmultiple augmentations of a single input, we introduce a mean teacher into the\nPLE scheme. This method helps to reduce noise in the weight maps and stabilize\nits generation process. Our extensive experimental results on public atrial and\nprostate segmentation datasets demonstrate that our proposed method achieves\nstate-of-the-art results under semi-supervision. Our code is available at\nhttps://github.com/aijinrjinr/MLB-Seg.\n","authors":["Qingyue Wei","Lequan Yu","Xianhang Li","Wei Shao","Cihang Xie","Lei Xing","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11604v1.pdf","comment":"Accepted to MICCAI 2023. Code is publicly available at\n https://github.com/aijinrjinr/MLB-Seg"},{"id":"http://arxiv.org/abs/2307.11603v1","updated":"2023-07-21T14:12:28Z","published":"2023-07-21T14:12:28Z","title":"Cascaded multitask U-Net using topological loss for vessel segmentation\n and centerline extraction","summary":" Vessel segmentation and centerline extraction are two crucial preliminary\ntasks for many computer-aided diagnosis tools dealing with vascular diseases.\nRecently, deep-learning based methods have been widely applied to these tasks.\nHowever, classic deep-learning approaches struggle to capture the complex\ngeometry and specific topology of vascular networks, which is of the utmost\nimportance in most applications. To overcome these limitations, the clDice\nloss, a topological loss that focuses on the vessel centerlines, has been\nrecently proposed. This loss requires computing, with a proposed soft-skeleton\nalgorithm, the skeletons of both the ground truth and the predicted\nsegmentation. However, the soft-skeleton algorithm provides suboptimal results\non 3D images, which makes the clDice hardly suitable on 3D images. In this\npaper, we propose to replace the soft-skeleton algorithm by a U-Net which\ncomputes the vascular skeleton directly from the segmentation. We show that our\nmethod provides more accurate skeletons than the soft-skeleton algorithm. We\nthen build upon this network a cascaded U-Net trained with the clDice loss to\nembed topological constraints during the segmentation. 
The resulting model is\nable to predict both the vessel segmentation and centerlines with a more\naccurate topology.\n","authors":["Pierre Rougé","Nicolas Passat","Odyssée Merveille"],"pdf_url":"https://arxiv.org/pdf/2307.11603v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.18453v2","updated":"2023-07-21T13:26:21Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Medical Image Synthesis","summary":" The demand for artificial intelligence (AI) in healthcare is rapidly\nincreasing. However, significant challenges arise from data scarcity and\nprivacy concerns, particularly in medical imaging. While existing generative\nmodels have achieved success in image synthesis and image-to-image translation\ntasks, there remains a gap in the generation of 3D semantic medical images. To\naddress this gap, we introduce Med-DDPM, a diffusion model specifically\ndesigned for semantic 3D medical image synthesis, effectively tackling data\nscarcity and privacy issues. The novelty of Med-DDPM lies in its incorporation\nof semantic conditioning, enabling precise control during the image generation\nprocess. Our model outperforms Generative Adversarial Networks (GANs) in terms\nof stability and performance, generating diverse and anatomically coherent\nimages with high visual fidelity. Comparative analysis against state-of-the-art\naugmentation techniques demonstrates that Med-DDPM produces comparable results,\nhighlighting its potential as a data augmentation tool for enhancing model\naccuracy. In conclusion, Med-DDPM pioneers 3D semantic medical image synthesis\nby delivering high-quality and anatomically coherent images. Furthermore, the\nintegration of semantic conditioning with Med-DDPM holds promise for image\nanonymization in the field of biomedical imaging, showcasing the capabilities\nof the model in addressing challenges related to data scarcity and privacy\nconcerns.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11567v1","updated":"2023-07-21T13:18:43Z","published":"2023-07-21T13:18:43Z","title":"CortexMorph: fast cortical thickness estimation via diffeomorphic\n registration using VoxelMorph","summary":" The thickness of the cortical band is linked to various neurological and\npsychiatric conditions, and is often estimated through surface-based methods\nsuch as Freesurfer in MRI studies. The DiReCT method, which calculates cortical\nthickness using a diffeomorphic deformation of the gray-white matter interface\ntowards the pial surface, offers an alternative to surface-based methods.\nRecent studies using a synthetic cortical thickness phantom have demonstrated\nthat the combination of DiReCT and deep-learning-based segmentation is more\nsensitive to subvoxel cortical thinning than Freesurfer.\n While anatomical segmentation of a T1-weighted image now takes seconds,\nexisting implementations of DiReCT rely on iterative image registration methods\nwhich can take up to an hour per volume. On the other hand, learning-based\ndeformable image registration methods like VoxelMorph have been shown to be\nfaster than classical methods while improving registration accuracy. This paper\nproposes CortexMorph, a new method that employs unsupervised deep learning to\ndirectly regress the deformation field needed for DiReCT. 
By combining\nCortexMorph with a deep-learning-based segmentation model, it is possible to\nestimate region-wise thickness in seconds from a T1-weighted image, while\nmaintaining the ability to detect cortical atrophy. We validate this claim on\nthe OASIS-3 dataset and the synthetic cortical thickness phantom of Rusak et\nal.\n","authors":["Richard McKinley","Christian Rummel"],"pdf_url":"https://arxiv.org/pdf/2307.11567v1.pdf","comment":"Accepted (early acceptance) at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11558v1","updated":"2023-07-21T13:06:02Z","published":"2023-07-21T13:06:02Z","title":"Advancing Visual Grounding with Scene Knowledge: Benchmark and Method","summary":" Visual grounding (VG) aims to establish fine-grained alignment between vision\nand language. Ideally, it can be a testbed for vision-and-language models to\nevaluate their understanding of the images and texts and their reasoning\nabilities over their joint space. However, most existing VG datasets are\nconstructed using simple description texts, which do not require sufficient\nreasoning over the images and texts. This has been demonstrated in a recent\nstudy~\\cite{luo2022goes}, where a simple LSTM-based text encoder without\npretraining can achieve state-of-the-art performance on mainstream VG datasets.\nTherefore, in this paper, we propose a novel benchmark of \\underline{S}cene\n\\underline{K}nowledge-guided \\underline{V}isual \\underline{G}rounding (SK-VG),\nwhere the image content and referring expressions are not sufficient to ground\nthe target objects, forcing the models to have a reasoning ability on the\nlong-form scene knowledge. To perform this task, we propose two approaches to\naccept the triple-type input, where the former embeds knowledge into the image\nfeatures before the image-query interaction; the latter leverages linguistic\nstructure to assist in computing the image-text matching. We conduct extensive\nexperiments to analyze the above methods and show that the proposed approaches\nachieve promising results but still leave room for improvement, including\nperformance and interpretability. The dataset and code are available at\n\\url{https://github.com/zhjohnchan/SK-VG}.\n","authors":["Zhihong Chen","Ruifei Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11558v1.pdf","comment":"Computer Vision and Natural Language Processing. 21 pages, 14\n figures. CVPR-2023"},{"id":"http://arxiv.org/abs/2307.11550v1","updated":"2023-07-21T12:53:54Z","published":"2023-07-21T12:53:54Z","title":"YOLOPose V2: Understanding and Improving Transformer-based 6D Pose\n Estimation","summary":" 6D object pose estimation is a crucial prerequisite for autonomous robot\nmanipulation applications. The state-of-the-art models for pose estimation are\nconvolutional neural network (CNN)-based. Lately, Transformers, an architecture\noriginally proposed for natural language processing, is achieving\nstate-of-the-art results in many computer vision tasks as well. Equipped with\nthe multi-head self-attention mechanism, Transformers enable simple\nsingle-stage end-to-end architectures for learning object detection and 6D\nobject pose estimation jointly. In this work, we propose YOLOPose (short form\nfor You Only Look Once Pose estimation), a Transformer-based multi-object 6D\npose estimation method based on keypoint regression and an improved variant of\nthe YOLOPose model. In contrast to the standard heatmaps for predicting\nkeypoints in an image, we directly regress the keypoints. 
Additionally, we\nemploy a learnable orientation estimation module to predict the orientation\nfrom the keypoints. Along with a separate translation estimation module, our\nmodel is end-to-end differentiable. Our method is suitable for real-time\napplications and achieves results comparable to state-of-the-art methods. We\nanalyze the role of object queries in our architecture and reveal that the\nobject queries specialize in detecting objects in specific image regions.\nFurthermore, we quantify the accuracy trade-off of using datasets of smaller\nsizes to train our model.\n","authors":["Arul Selvam Periyasamy","Arash Amini","Vladimir Tsaturyan","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2307.11550v1.pdf","comment":"Robotics and Autonomous Systems Journal, Elsevier, to appear 2023.\n arXiv admin note: substantial text overlap with arXiv:2205.02536"},{"id":"http://arxiv.org/abs/2307.11545v1","updated":"2023-07-21T12:46:15Z","published":"2023-07-21T12:46:15Z","title":"Bridging Vision and Language Encoders: Parameter-Efficient Tuning for\n Referring Image Segmentation","summary":" Parameter Efficient Tuning (PET) has gained attention for reducing the number\nof parameters while maintaining performance and providing better hardware\nresource savings, but few studies investigate dense prediction tasks and\ninteraction between modalities. In this paper, we do an investigation of\nefficient tuning problems on referring image segmentation. We propose a novel\nadapter called Bridger to facilitate cross-modal information exchange and\ninject task-specific information into the pre-trained model. We also design a\nlightweight decoder for image segmentation. Our approach achieves comparable or\nsuperior performance with only 1.61\\% to 3.38\\% backbone parameter updates,\nevaluated on challenging benchmarks. The code is available at\n\\url{https://github.com/kkakkkka/ETRIS}.\n","authors":["Zunnan Xu","Zhihong Chen","Yong Zhang","Yibing Song","Xiang Wan","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2307.11545v1.pdf","comment":"Computer Vision and Natural Language Processing. 14 pages, 8 figures.\n ICCV-2023"},{"id":"http://arxiv.org/abs/2303.11057v3","updated":"2023-07-21T12:43:23Z","published":"2023-03-20T12:14:13Z","title":"Learning Foresightful Dense Visual Affordance for Deformable Object\n Manipulation","summary":" Understanding and manipulating deformable objects (e.g., ropes and fabrics)\nis an essential yet challenging task with broad applications. Difficulties come\nfrom complex states and dynamics, diverse configurations and high-dimensional\naction space of deformable objects. Besides, the manipulation tasks usually\nrequire multiple steps to accomplish, and greedy policies may easily lead to\nlocal optimal states. Existing studies usually tackle this problem using\nreinforcement learning or imitating expert demonstrations, with limitations in\nmodeling complex states or requiring hand-crafted expert policies. In this\npaper, we study deformable object manipulation using dense visual affordance,\nwith generalization towards diverse states, and propose a novel kind of\nforesightful dense affordance, which avoids local optima by estimating states'\nvalues for long-term manipulation. We propose a framework for learning this\nrepresentation, with novel designs such as multi-stage stable learning and\nefficient self-supervised data collection without experts. 
Experiments\ndemonstrate the superiority of our proposed foresightful dense affordance.\nProject page: https://hyperplane-lab.github.io/DeformableAffordance\n","authors":["Ruihai Wu","Chuanruo Ning","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2303.11057v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11543v1","updated":"2023-07-21T12:43:07Z","published":"2023-07-21T12:43:07Z","title":"KVN: Keypoints Voting Network with Differentiable RANSAC for Stereo Pose\n Estimation","summary":" Object pose estimation is a fundamental computer vision task exploited in\nseveral robotics and augmented reality applications. Many established\napproaches rely on predicting 2D-3D keypoint correspondences using RANSAC\n(Random sample consensus) and estimating the object pose using the PnP\n(Perspective-n-Point) algorithm. Being RANSAC non-differentiable,\ncorrespondences cannot be directly learned in an end-to-end fashion. In this\npaper, we address the stereo image-based object pose estimation problem by (i)\nintroducing a differentiable RANSAC layer into a well-known monocular pose\nestimation network; (ii) exploiting an uncertainty-driven multi-view PnP solver\nwhich can fuse information from multiple views. We evaluate our approach on a\nchallenging public stereo object pose estimation dataset, yielding\nstate-of-the-art results against other recent approaches. Furthermore, in our\nablation study, we show that the differentiable RANSAC layer plays a\nsignificant role in the accuracy of the proposed method. We release with this\npaper the open-source implementation of our method.\n","authors":["Ivano Donadi","Alberto Pretto"],"pdf_url":"https://arxiv.org/pdf/2307.11543v1.pdf","comment":"Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2307.11530v1","updated":"2023-07-21T12:23:39Z","published":"2023-07-21T12:23:39Z","title":"UWAT-GAN: Fundus Fluorescein Angiography Synthesis via Ultra-wide-angle\n Transformation Multi-scale GAN","summary":" Fundus photography is an essential examination for clinical and differential\ndiagnosis of fundus diseases. Recently, Ultra-Wide-angle Fundus (UWF)\ntechniques, UWF Fluorescein Angiography (UWF-FA) and UWF Scanning Laser\nOphthalmoscopy (UWF-SLO) have been gradually put into use. However, Fluorescein\nAngiography (FA) and UWF-FA require injecting sodium fluorescein which may have\ndetrimental influences. To avoid negative impacts, cross-modality medical image\ngeneration algorithms have been proposed. Nevertheless, current methods in\nfundus imaging could not produce high-resolution images and are unable to\ncapture tiny vascular lesion areas. This paper proposes a novel conditional\ngenerative adversarial network (UWAT-GAN) to synthesize UWF-FA from UWF-SLO.\nUsing multi-scale generators and a fusion module patch to better extract global\nand local information, our model can generate high-resolution images. Moreover,\nan attention transmit module is proposed to help the decoder learn effectively.\nBesides, a supervised approach is used to train the network using multiple new\nweighted losses on different scales of data. Experiments on an in-house UWF\nimage dataset demonstrate the superiority of the UWAT-GAN over the\nstate-of-the-art methods. 
The source code is available at:\nhttps://github.com/Tinysqua/UWAT-GAN.\n","authors":["Zhaojie Fang","Zhanghao Chen","Pengxue Wei","Wangting Li","Shaochong Zhang","Ahmed Elazab","Gangyong Jia","Ruiquan Ge","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11530v1.pdf","comment":"26th International Conference on Medical Image Computing and Computer\n Assisted Intervention"},{"id":"http://arxiv.org/abs/2307.11528v1","updated":"2023-07-21T12:18:35Z","published":"2023-07-21T12:18:35Z","title":"Improving Viewpoint Robustness for Visual Recognition via Adversarial\n Training","summary":" Viewpoint invariance remains challenging for visual recognition in the 3D\nworld, as altering the viewing directions can significantly impact predictions\nfor the same object. While substantial efforts have been dedicated to making\nneural networks invariant to 2D image translations and rotations, viewpoint\ninvariance is rarely investigated. Motivated by the success of adversarial\ntraining in enhancing model robustness, we propose Viewpoint-Invariant\nAdversarial Training (VIAT) to improve the viewpoint robustness of image\nclassifiers. Regarding viewpoint transformation as an attack, we formulate VIAT\nas a minimax optimization problem, where the inner maximization characterizes\ndiverse adversarial viewpoints by learning a Gaussian mixture distribution\nbased on the proposed attack method GMVFool. The outer minimization obtains a\nviewpoint-invariant classifier by minimizing the expected loss over the\nworst-case viewpoint distributions that can share the same one for different\nobjects within the same category. Based on GMVFool, we contribute a large-scale\ndataset called ImageNet-V+ to benchmark viewpoint robustness. Experimental\nresults show that VIAT significantly improves the viewpoint robustness of\nvarious image classifiers based on the diversity of adversarial viewpoints\ngenerated by GMVFool. Furthermore, we propose ViewRS, a certified viewpoint\nrobustness method that provides a certified radius and accuracy to demonstrate\nthe effectiveness of VIAT from the theoretical perspective.\n","authors":["Shouwei Ruan","Yinpeng Dong","Hang Su","Jianteng Peng","Ning Chen","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2307.11528v1.pdf","comment":"14 pages, 12 figures. arXiv admin note: substantial text overlap with\n arXiv:2307.10235"},{"id":"http://arxiv.org/abs/2303.11630v2","updated":"2023-07-21T12:15:41Z","published":"2023-03-21T06:54:18Z","title":"BoxSnake: Polygonal Instance Segmentation with Box Supervision","summary":" Box-supervised instance segmentation has gained much attention as it requires\nonly simple box annotations instead of costly mask or polygon annotations.\nHowever, existing box-supervised instance segmentation models mainly focus on\nmask-based frameworks. We propose a new end-to-end training technique, termed\nBoxSnake, to achieve effective polygonal instance segmentation using only box\nannotations for the first time. Our method consists of two loss functions: (1)\na point-based unary loss that constrains the bounding box of predicted polygons\nto achieve coarse-grained segmentation; and (2) a distance-aware pairwise loss\nthat encourages the predicted polygons to fit the object boundaries. 
Compared\nwith the mask-based weakly-supervised methods, BoxSnake further reduces the\nperformance gap between the predicted segmentation and the bounding box, and\nshows significant superiority on the Cityscapes dataset.\n","authors":["Rui Yang","Lin Song","Yixiao Ge","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2303.11630v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.06666v2","updated":"2023-07-21T12:15:16Z","published":"2023-07-13T10:19:04Z","title":"Transformer-based end-to-end classification of variable-length\n volumetric data","summary":" The automatic classification of 3D medical data is memory-intensive. Also,\nvariations in the number of slices between samples is common. Na\\\"ive solutions\nsuch as subsampling can solve these problems, but at the cost of potentially\neliminating relevant diagnosis information. Transformers have shown promising\nperformance for sequential data analysis. However, their application for long\nsequences is data, computationally, and memory demanding. In this paper, we\npropose an end-to-end Transformer-based framework that allows to classify\nvolumetric data of variable length in an efficient fashion. Particularly, by\nrandomizing the input volume-wise resolution(#slices) during training, we\nenhance the capacity of the learnable positional embedding assigned to each\nvolume slice. Consequently, the accumulated positional information in each\npositional embedding can be generalized to the neighbouring slices, even for\nhigh-resolution volumes at the test time. By doing so, the model will be more\nrobust to variable volume length and amenable to different computational\nbudgets. We evaluated the proposed approach in retinal OCT volume\nclassification and achieved 21.96% average improvement in balanced accuracy on\na 9-class diagnostic task, compared to state-of-the-art video transformers. Our\nfindings show that varying the volume-wise resolution of the input during\ntraining results in more informative volume representation as compared to\ntraining with fixed number of slices per volume.\n","authors":["Marzieh Oghbaie","Teresa Araujo","Taha Emre","Ursula Schmidt-Erfurth","Hrvoje Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2307.06666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11526v1","updated":"2023-07-21T12:14:33Z","published":"2023-07-21T12:14:33Z","title":"CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) have the potential to be a major representation\nof media. Since training a NeRF has never been an easy task, the protection of\nits model copyright should be a priority. In this paper, by analyzing the pros\nand cons of possible copyright protection solutions, we propose to protect the\ncopyright of NeRF models by replacing the original color representation in NeRF\nwith a watermarked color representation. Then, a distortion-resistant rendering\nscheme is designed to guarantee robust message extraction in 2D renderings of\nNeRF. 
Our proposed method can directly protect the copyright of NeRF models\nwhile maintaining high rendering quality and bit accuracy when compared among\noptional solutions.\n","authors":["Ziyuan Luo","Qing Guo","Ka Chun Cheung","Simon See","Renjie Wan"],"pdf_url":"https://arxiv.org/pdf/2307.11526v1.pdf","comment":"11 pages, 6 figures, accepted by iccv 2023 non-camera-ready version"},{"id":"http://arxiv.org/abs/2304.14133v2","updated":"2023-07-21T12:06:17Z","published":"2023-04-27T12:28:29Z","title":"VERITE: A Robust Benchmark for Multimodal Misinformation Detection\n Accounting for Unimodal Bias","summary":" Multimedia content has become ubiquitous on social media platforms, leading\nto the rise of multimodal misinformation (MM) and the urgent need for effective\nstrategies to detect and prevent its spread. In recent years, the challenge of\nmultimodal misinformation detection (MMD) has garnered significant attention by\nresearchers and has mainly involved the creation of annotated, weakly\nannotated, or synthetically generated training datasets, along with the\ndevelopment of various deep learning MMD models. However, the problem of\nunimodal bias in MMD benchmarks -- where biased or unimodal methods outperform\ntheir multimodal counterparts on an inherently multimodal task -- has been\noverlooked. In this study, we systematically investigate and identify the\npresence of unimodal bias in widely-used MMD benchmarks (VMU-Twitter, COSMOS),\nraising concerns about their suitability for reliable evaluation. To address\nthis issue, we introduce the \"VERification of Image-TExtpairs\" (VERITE)\nbenchmark for MMD which incorporates real-world data, excludes \"asymmetric\nmultimodal misinformation\" and utilizes \"modality balancing\". We conduct an\nextensive comparative study with a Transformer-based architecture that shows\nthe ability of VERITE to effectively address unimodal bias, rendering it a\nrobust evaluation framework for MMD. Furthermore, we introduce a new method --\ntermed Crossmodal HArd Synthetic MisAlignment (CHASMA) -- for generating\nrealistic synthetic training data that preserve crossmodal relations between\nlegitimate images and false human-written captions. By leveraging CHASMA in the\ntraining process, we observe consistent and notable improvements in predictive\nperformance on VERITE; with a 9.2% increase in accuracy. We release our code\nat: https://github.com/stevejpapad/image-text-verification\n","authors":["Stefanos-Iordanis Papadopoulos","Christos Koutlis","Symeon Papadopoulos","Panagiotis C. Petrantonakis"],"pdf_url":"https://arxiv.org/pdf/2304.14133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11518v1","updated":"2023-07-21T12:03:39Z","published":"2023-07-21T12:03:39Z","title":"BatMobility: Towards Flying Without Seeing for Autonomous Drones","summary":" Unmanned aerial vehicles (UAVs) rely on optical sensors such as cameras and\nlidar for autonomous operation. However, such optical sensors are error-prone\nin bad lighting, inclement weather conditions including fog and smoke, and\naround textureless or transparent surfaces. In this paper, we ask: is it\npossible to fly UAVs without relying on optical sensors, i.e., can UAVs fly\nwithout seeing? 
We present BatMobility, a lightweight mmWave radar-only\nperception system for UAVs that eliminates the need for optical sensors.\nBatMobility enables two core functionalities for UAVs -- radio flow estimation\n(a novel FMCW radar-based alternative for optical flow based on\nsurface-parallel doppler shift) and radar-based collision avoidance. We build\nBatMobility using commodity sensors and deploy it as a real-time system on a\nsmall off-the-shelf quadcopter running an unmodified flight controller. Our\nevaluation shows that BatMobility achieves comparable or better performance\nthan commercial-grade optical sensors across a wide range of scenarios.\n","authors":["Emerson Sie","Zikun Liu","Deepak Vasisht"],"pdf_url":"https://arxiv.org/pdf/2307.11518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07308v3","updated":"2023-07-21T11:52:28Z","published":"2023-06-12T13:48:37Z","title":"Self-Supervised Hyperspectral Inpainting with the Optimisation inspired\n Deep Neural Network Prior","summary":" Hyperspectral Image (HSI)s cover hundreds or thousands of narrow spectral\nbands, conveying a wealth of spatial and spectral information. However, due to\nthe instrumental errors and the atmospheric changes, the HSI obtained in\npractice are often contaminated by noise and dead pixels(lines), resulting in\nmissing information that may severely compromise the subsequent applications.\nWe introduce here a novel HSI missing pixel prediction algorithm, called Low\nRank and Sparsity Constraint Plug-and-Play (LRS-PnP). It is shown that LRS-PnP\nis able to predict missing pixels and bands even when all spectral bands of the\nimage are missing. The proposed LRS-PnP algorithm is further extended to a\nself-supervised model by combining the LRS-PnP with the Deep Image Prior (DIP),\ncalled LRS-PnP-DIP. In a series of experiments with real data, It is shown that\nthe LRS-PnP-DIP either achieves state-of-the-art inpainting performance\ncompared to other learning-based methods, or outperforms them.\n","authors":["Shuo Li","Mehrdad Yaghoobi"],"pdf_url":"https://arxiv.org/pdf/2306.07308v3.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2208.05788v2","updated":"2023-07-21T11:50:11Z","published":"2022-08-10T12:29:01Z","title":"Semantic Self-adaptation: Enhancing Generalization with a Single Sample","summary":" The lack of out-of-domain generalization is a critical weakness of deep\nnetworks for semantic segmentation. Previous studies relied on the assumption\nof a static model, i. e., once the training process is complete, model\nparameters remain fixed at test time. In this work, we challenge this premise\nwith a self-adaptive approach for semantic segmentation that adjusts the\ninference process to each input sample. Self-adaptation operates on two levels.\nFirst, it fine-tunes the parameters of convolutional layers to the input image\nusing consistency regularization. Second, in Batch Normalization layers,\nself-adaptation interpolates between the training and the reference\ndistribution derived from a single test sample. Despite both techniques being\nwell known in the literature, their combination sets new state-of-the-art\naccuracy on synthetic-to-real generalization benchmarks. Our empirical study\nsuggests that self-adaptation may complement the established practice of model\nregularization at training time for improving deep network generalization to\nout-of-domain data. 
Our code and pre-trained models are available at\nhttps://github.com/visinf/self-adaptive.\n","authors":["Sherwin Bahmani","Oliver Hahn","Eduard Zamfir","Nikita Araslanov","Daniel Cremers","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2208.05788v2.pdf","comment":"Published in TMLR (July 2023); OpenReview:\n https://openreview.net/forum?id=ILNqQhGbLx; Code:\n https://github.com/visinf/self-adaptive; Video: https://youtu.be/s4DG65ic0EA"},{"id":"http://arxiv.org/abs/2307.11514v1","updated":"2023-07-21T11:50:05Z","published":"2023-07-21T11:50:05Z","title":"CORE: Cooperative Reconstruction for Multi-Agent Perception","summary":" This paper presents CORE, a conceptually simple, effective and\ncommunication-efficient model for multi-agent cooperative perception. It\naddresses the task from a novel perspective of cooperative reconstruction,\nbased on two key insights: 1) cooperating agents together provide a more\nholistic observation of the environment, and 2) the holistic observation can\nserve as valuable supervision to explicitly guide the model learning how to\nreconstruct the ideal observation based on collaboration. CORE instantiates the\nidea with three major components: a compressor for each agent to create more\ncompact feature representation for efficient broadcasting, a lightweight\nattentive collaboration component for cross-agent message aggregation, and a\nreconstruction module to reconstruct the observation based on aggregated\nfeature representations. This learning-to-reconstruct idea is task-agnostic,\nand offers clear and reasonable supervision to inspire more effective\ncollaboration, eventually promoting perception tasks. We validate CORE on\nOPV2V, a large-scale multi-agent percetion dataset, in two tasks, i.e., 3D\nobject detection and semantic segmentation. Results demonstrate that the model\nachieves state-of-the-art performance on both tasks, and is more\ncommunication-efficient.\n","authors":["Binglu Wang","Lei Zhang","Zhaozhong Wang","Yongqiang Zhao","Tianfei Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11513v1","updated":"2023-07-21T11:49:30Z","published":"2023-07-21T11:49:30Z","title":"Bone mineral density estimation from a plain X-ray image by learning\n decomposition into projections of bone-segmented computed tomography","summary":" Osteoporosis is a prevalent bone disease that causes fractures in fragile\nbones, leading to a decline in daily living activities. Dual-energy X-ray\nabsorptiometry (DXA) and quantitative computed tomography (QCT) are highly\naccurate for diagnosing osteoporosis; however, these modalities require special\nequipment and scan protocols. To frequently monitor bone health, low-cost,\nlow-dose, and ubiquitously available diagnostic methods are highly anticipated.\nIn this study, we aim to perform bone mineral density (BMD) estimation from a\nplain X-ray image for opportunistic screening, which is potentially useful for\nearly diagnosis. Existing methods have used multi-stage approaches consisting\nof extraction of the region of interest and simple regression to estimate BMD,\nwhich require a large amount of training data. Therefore, we propose an\nefficient method that learns decomposition into projections of bone-segmented\nQCT for BMD estimation under limited datasets. 
The proposed method achieved\nhigh accuracy in BMD estimation, where Pearson correlation coefficients of\n0.880 and 0.920 were observed for DXA-measured BMD and QCT-measured BMD\nestimation tasks, respectively, and the root mean square of the coefficient of\nvariation values were 3.27 to 3.79% for four measurements with different poses.\nFurthermore, we conducted extensive validation experiments, including\nmulti-pose, uncalibrated-CT, and compression experiments toward actual\napplication in routine clinical practice.\n","authors":["Yi Gu","Yoshito Otake","Keisuke Uemura","Mazen Soufi","Masaki Takao","Hugues Talbot","Seiji Okada","Nobuhiko Sugano","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2307.11513v1.pdf","comment":"20 pages and 22 figures"},{"id":"http://arxiv.org/abs/2305.19920v2","updated":"2023-07-21T11:27:30Z","published":"2023-05-31T14:56:18Z","title":"MSKdeX: Musculoskeletal (MSK) decomposition from an X-ray image for\n fine-grained estimation of lean muscle mass and muscle volume","summary":" Musculoskeletal diseases such as sarcopenia and osteoporosis are major\nobstacles to health during aging. Although dual-energy X-ray absorptiometry\n(DXA) and computed tomography (CT) can be used to evaluate musculoskeletal\nconditions, frequent monitoring is difficult due to the cost and accessibility\n(as well as high radiation exposure in the case of CT). We propose a method\n(named MSKdeX) to estimate fine-grained muscle properties from a plain X-ray\nimage, a low-cost, low-radiation, and highly accessible imaging modality,\nthrough musculoskeletal decomposition leveraging fine-grained segmentation in\nCT. We train a multi-channel quantitative image translation model to decompose\nan X-ray image into projections of CT of individual muscles to infer the lean\nmuscle mass and muscle volume. We propose the object-wise intensity-sum loss, a\nsimple yet surprisingly effective metric invariant to muscle deformation and\nprojection direction, utilizing information in CT and X-ray images collected\nfrom the same patient. While our method is basically an unpaired image-to-image\ntranslation, we also exploit the nature of the bone's rigidity, which provides\nthe paired data through 2D-3D rigid registration, adding strong pixel-wise\nsupervision in unpaired training. Through the evaluation using a 539-patient\ndataset, we showed that the proposed method significantly outperformed\nconventional methods. The average Pearson correlation coefficient between the\npredicted and CT-derived ground truth metrics was increased from 0.460 to\n0.863. We believe our method opened up a new musculoskeletal diagnosis method\nand has the potential to be extended to broader applications in multi-channel\nquantitative image translation tasks. Our source code will be released soon.\n","authors":["Yi Gu","Yoshito Otake","Keisuke Uemura","Masaki Takao","Mazen Soufi","Yuta Hiasa","Hugues Talbot","Seiji Okata","Nobuhiko Sugano","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2305.19920v2.pdf","comment":"MICCAI 2023 early acceptance (12 pages and 6 figures)"},{"id":"http://arxiv.org/abs/2306.00988v2","updated":"2023-07-21T11:27:10Z","published":"2023-06-01T17:59:57Z","title":"Continual Learning for Abdominal Multi-Organ and Tumor Segmentation","summary":" The ability to dynamically extend a model to new data and classes is critical\nfor multiple organ and tumor segmentation. However, due to privacy regulations,\naccessing previous data and annotations can be problematic in the medical\ndomain. 
This poses a significant barrier to preserving the high segmentation\naccuracy of the old classes when learning from new classes because of the\ncatastrophic forgetting problem. In this paper, we first empirically\ndemonstrate that simply using high-quality pseudo labels can fairly mitigate\nthis problem in the setting of organ segmentation. Furthermore, we put forward\nan innovative architecture designed specifically for continuous organ and tumor\nsegmentation, which incurs minimal computational overhead. Our proposed design\ninvolves replacing the conventional output layer with a suite of lightweight,\nclass-specific heads, thereby offering the flexibility to accommodate newly\nemerging classes. These heads enable independent predictions for newly\nintroduced and previously learned classes, effectively minimizing the impact of\nnew classes on old ones during the course of continual learning. We further\npropose incorporating Contrastive Language-Image Pretraining (CLIP) embeddings\ninto the organ-specific heads. These embeddings encapsulate the semantic\ninformation of each class, informed by extensive image-text co-training. The\nproposed method is evaluated on both in-house and public abdominal CT datasets\nunder organ and tumor segmentation tasks. Empirical results suggest that the\nproposed design improves the segmentation performance of a baseline neural\nnetwork on newly-introduced and previously-learned classes along the learning\ntrajectory.\n","authors":["Yixiao Zhang","Xinyi Li","Huimiao Chen","Alan Yuille","Yaoyao Liu","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.00988v2.pdf","comment":"MICCAI-2023"},{"id":"http://arxiv.org/abs/2303.05966v2","updated":"2023-07-21T11:21:30Z","published":"2023-03-10T14:55:35Z","title":"Score-Based Generative Models for Medical Image Segmentation using\n Signed Distance Functions","summary":" Medical image segmentation is a crucial task that relies on the ability to\naccurately identify and isolate regions of interest in medical images. Thereby,\ngenerative approaches allow to capture the statistical properties of\nsegmentation masks that are dependent on the respective structures. In this\nwork we propose a conditional score-based generative modeling framework to\nrepresent the signed distance function (SDF) leading to an implicit\ndistribution of segmentation masks. The advantage of leveraging the SDF is a\nmore natural distortion when compared to that of binary masks. By learning the\nscore function of the conditional distribution of SDFs we can accurately sample\nfrom the distribution of segmentation masks, allowing for the evaluation of\nstatistical quantities. Thus, this probabilistic representation allows for the\ngeneration of uncertainty maps represented by the variance, which can aid in\nfurther analysis and enhance the predictive robustness. 
We qualitatively and\nquantitatively illustrate competitive performance of the proposed method on a\npublic nuclei and gland segmentation data set, highlighting its potential\nutility in medical image segmentation applications.\n","authors":["Lea Bogensperger","Dominik Narnhofer","Filip Ilic","Thomas Pock"],"pdf_url":"https://arxiv.org/pdf/2303.05966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11482v1","updated":"2023-07-21T10:36:05Z","published":"2023-07-21T10:36:05Z","title":"Redemption from Range-view for Accurate 3D Object Detection","summary":" Most recent approaches for 3D object detection predominantly rely on\npoint-view or bird's-eye view representations, with limited exploration of\nrange-view-based methods. The range-view representation suffers from scale\nvariation and surface texture deficiency, both of which pose significant\nlimitations for developing corresponding methods. Notably, the surface texture\nloss problem has been largely ignored by all existing methods, despite its\nsignificant impact on the accuracy of range-view-based 3D object detection. In\nthis study, we propose Redemption from Range-view R-CNN (R2 R-CNN), a novel and\naccurate approach that comprehensively explores the range-view representation.\nOur proposed method addresses scale variation through the HD Meta Kernel, which\ncaptures range-view geometry information in multiple scales. Additionally, we\nintroduce Feature Points Redemption (FPR) to recover the lost 3D surface\ntexture information from the range view, and Synchronous-Grid RoI Pooling\n(S-Grid RoI Pooling), a multi-scaled approach with multiple receptive fields\nfor accurate box refinement. Our R2 R-CNN outperforms existing range-view-based\nmethods, achieving state-of-the-art performance on both the KITTI benchmark and\nthe Waymo Open Dataset. Our study highlights the critical importance of\naddressing the surface texture loss problem for accurate 3D object detection in\nrange-view-based methods. Codes will be made publicly available.\n","authors":["Yihan Wang","Qiao Yan"],"pdf_url":"https://arxiv.org/pdf/2307.11482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11477v1","updated":"2023-07-21T10:28:19Z","published":"2023-07-21T10:28:19Z","title":"SA-BEV: Generating Semantic-Aware Bird's-Eye-View Feature for Multi-view\n 3D Object Detection","summary":" Recently, the pure camera-based Bird's-Eye-View (BEV) perception provides a\nfeasible solution for economical autonomous driving. However, the existing\nBEV-based multi-view 3D detectors generally transform all image features into\nBEV features, without considering the problem that the large proportion of\nbackground information may submerge the object information. In this paper, we\npropose Semantic-Aware BEV Pooling (SA-BEVPool), which can filter out\nbackground information according to the semantic segmentation of image features\nand transform image features into semantic-aware BEV features. Accordingly, we\npropose BEV-Paste, an effective data augmentation strategy that closely matches\nwith semantic-aware BEV feature. In addition, we design a Multi-Scale\nCross-Task (MSCT) head, which combines task-specific and cross-task information\nto predict depth distribution and semantic segmentation more accurately,\nfurther improving the quality of semantic-aware BEV feature. Finally, we\nintegrate the above modules into a novel multi-view 3D object detection\nframework, namely SA-BEV. Experiments on nuScenes show that SA-BEV achieves\nstate-of-the-art performance. 
Code has been available at\nhttps://github.com/mengtan00/SA-BEV.git.\n","authors":["Jinqing Zhang","Yanan Zhang","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11471v1","updated":"2023-07-21T10:12:09Z","published":"2023-07-21T10:12:09Z","title":"Robust Visual Question Answering: Datasets, Methods, and Future\n Challenges","summary":" Visual question answering requires a system to provide an accurate natural\nlanguage answer given an image and a natural language question. However, it is\nwidely recognized that previous generic VQA methods often exhibit a tendency to\nmemorize biases present in the training data rather than learning proper\nbehaviors, such as grounding images before predicting answers. Therefore, these\nmethods usually achieve high in-distribution but poor out-of-distribution\nperformance. In recent years, various datasets and debiasing methods have been\nproposed to evaluate and enhance the VQA robustness, respectively. This paper\nprovides the first comprehensive survey focused on this emerging fashion.\nSpecifically, we first provide an overview of the development process of\ndatasets from in-distribution and out-of-distribution perspectives. Then, we\nexamine the evaluation metrics employed by these datasets. Thirdly, we propose\na typology that presents the development process, similarities and differences,\nrobustness comparison, and technical features of existing debiasing methods.\nFurthermore, we analyze and discuss the robustness of representative\nvision-and-language pre-training models on VQA. Finally, through a thorough\nreview of the available literature and experimental analysis, we discuss the\nkey areas for future research from various viewpoints.\n","authors":["Jie Ma","Pinghui Wang","Dechen Kong","Zewei Wang","Jun Liu","Hongbin Pei","Junzhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.11471v1.pdf","comment":"IEEE TPAMI (Under Review)"},{"id":"http://arxiv.org/abs/2307.11470v1","updated":"2023-07-21T10:10:18Z","published":"2023-07-21T10:10:18Z","title":"Physics-Aware Semi-Supervised Underwater Image Enhancement","summary":" Underwater images normally suffer from degradation due to the transmission\nmedium of water bodies. Both traditional prior-based approaches and deep\nlearning-based methods have been used to address this problem. However, the\ninflexible assumption of the former often impairs their effectiveness in\nhandling diverse underwater scenes, while the generalization of the latter to\nunseen images is usually weakened by insufficient data. In this study, we\nleverage both the physics-based underwater Image Formation Model (IFM) and deep\nlearning techniques for Underwater Image Enhancement (UIE). To this end, we\npropose a novel Physics-Aware Dual-Stream Underwater Image Enhancement Network,\ni.e., PA-UIENet, which comprises a Transmission Estimation Steam (T-Stream) and\nan Ambient Light Estimation Stream (A-Stream). This network fulfills the UIE\ntask by explicitly estimating the degradation parameters of the IFM. We also\nadopt an IFM-inspired semi-supervised learning framework, which exploits both\nthe labeled and unlabeled images, to address the issue of insufficient data.\nOur method performs better than, or at least comparably to, eight baselines\nacross five testing sets in the degradation estimation and UIE tasks. 
This\nshould be due to the fact that it not only can model the degradation but also\ncan learn the characteristics of diverse underwater scenes.\n","authors":["Hao Qi","Xinghui Dong"],"pdf_url":"https://arxiv.org/pdf/2307.11470v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.11469v1","updated":"2023-07-21T10:08:58Z","published":"2023-07-21T10:08:58Z","title":"Distribution Shift Matters for Knowledge Distillation with Webly\n Collected Images","summary":" Knowledge distillation aims to learn a lightweight student network from a\npre-trained teacher network. In practice, existing knowledge distillation\nmethods are usually infeasible when the original training data is unavailable\ndue to some privacy issues and data management considerations. Therefore,\ndata-free knowledge distillation approaches proposed to collect training\ninstances from the Internet. However, most of them have ignored the common\ndistribution shift between the instances from original training data and webly\ncollected data, affecting the reliability of the trained student network. To\nsolve this problem, we propose a novel method dubbed ``Knowledge Distillation\nbetween Different Distributions\" (KD$^{3}$), which consists of three\ncomponents. Specifically, we first dynamically select useful training instances\nfrom the webly collected data according to the combined predictions of teacher\nnetwork and student network. Subsequently, we align both the weighted features\nand classifier parameters of the two networks for knowledge memorization.\nMeanwhile, we also build a new contrastive learning block called\nMixDistribution to generate perturbed data with a new distribution for instance\nalignment, so that the student network can further learn a\ndistribution-invariant representation. Intensive experiments on various\nbenchmark datasets demonstrate that our proposed KD$^{3}$ can outperform the\nstate-of-the-art data-free knowledge distillation approaches.\n","authors":["Jialiang Tang","Shuo Chen","Gang Niu","Masashi Sugiyama","Chen Gong"],"pdf_url":"https://arxiv.org/pdf/2307.11469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11466v1","updated":"2023-07-21T10:02:02Z","published":"2023-07-21T10:02:02Z","title":"MatSpectNet: Material Segmentation Network with Domain-Aware and\n Physically-Constrained Hyperspectral Reconstruction","summary":" Achieving accurate material segmentation for 3-channel RGB images is\nchallenging due to the considerable variation in a material's appearance.\nHyperspectral images, which are sets of spectral measurements sampled at\nmultiple wavelengths, theoretically offer distinct information for material\nidentification, as variations in intensity of electromagnetic radiation\nreflected by a surface depend on the material composition of a scene. However,\nexisting hyperspectral datasets are impoverished regarding the number of images\nand material categories for the dense material segmentation task, and\ncollecting and annotating hyperspectral images with a spectral camera is\nprohibitively expensive. To address this, we propose a new model, the\nMatSpectNet to segment materials with recovered hyperspectral images from RGB\nimages. 
The network leverages the principles of colour perception in modern\ncameras to constrain the reconstructed hyperspectral images and employs the\ndomain adaptation method to generalise the hyperspectral reconstruction\ncapability from a spectral recovery dataset to material segmentation datasets.\nThe reconstructed hyperspectral images are further filtered using learned\nresponse curves and enhanced with human perception. The performance of\nMatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces\ndataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase\nin average pixel accuracy and a 3.42% improvement in mean class accuracy\ncompared with the most recent publication. The project code is attached to the\nsupplementary material and will be published on GitHub.\n","authors":["Yuwen Heng","Yihong Wu","Jiawen Chen","Srinandan Dasmahapatra","Hansung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.11466v1.pdf","comment":"7 pages main content"},{"id":"http://arxiv.org/abs/2210.09563v2","updated":"2023-07-21T10:01:25Z","published":"2022-10-18T03:32:18Z","title":"FedForgery: Generalized Face Forgery Detection with Residual Federated\n Learning","summary":" With the continuous development of deep learning in the field of image\ngeneration models, a large number of vivid forged faces have been generated and\nspread on the Internet. These high-authenticity artifacts could grow into a\nthreat to society security. Existing face forgery detection methods directly\nutilize the obtained public shared or centralized data for training but ignore\nthe personal privacy and security issues when personal data couldn't be\ncentralizedly shared in real-world scenarios. Additionally, different\ndistributions caused by diverse artifact types would further bring adverse\ninfluences on the forgery detection task. To solve the mentioned problems, the\npaper proposes a novel generalized residual Federated learning for face Forgery\ndetection (FedForgery). The designed variational autoencoder aims to learn\nrobust discriminative residual feature maps to detect forgery faces (with\ndiverse or even unknown artifact types). Furthermore, the general federated\nlearning strategy is introduced to construct distributed detection model\ntrained collaboratively with multiple local decentralized devices, which could\nfurther boost the representation generalization. Experiments conducted on\npublicly available face forgery detection datasets prove the superior\nperformance of the proposed FedForgery. The designed novel generalized face\nforgery detection protocols and source code would be publicly available.\n","authors":["Decheng Liu","Zhan Dang","Chunlei Peng","Yu Zheng","Shuang Li","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2210.09563v2.pdf","comment":"The code is available at https://github.com/GANG370/FedForgery. The\n paper has been accepted in the IEEE Transactions on Information Forensics &\n Security"},{"id":"http://arxiv.org/abs/2307.10926v2","updated":"2023-07-21T09:47:01Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. 
The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard-deviation across of the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry experiments on 3D\nimage segmentation using the standard nnU-net framework, two datasets from the\nMedical Decathlon challenge and two performance measures: the Dice accuracy and\nthe Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard-deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquaux","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.11458v1","updated":"2023-07-21T09:40:42Z","published":"2023-07-21T09:40:42Z","title":"Strip-MLP: Efficient Token Interaction for Vision MLP","summary":" Token interaction operation is one of the core modules in MLP-based models to\nexchange and aggregate information between different spatial locations.\nHowever, the power of token interaction on the spatial dimension is highly\ndependent on the spatial resolution of the feature maps, which limits the\nmodel's expressive ability, especially in deep layers where the feature are\ndown-sampled to a small spatial size. To address this issue, we present a novel\nmethod called \\textbf{Strip-MLP} to enrich the token interaction power in three\nways. Firstly, we introduce a new MLP paradigm called Strip MLP layer that\nallows the token to interact with other tokens in a cross-strip manner,\nenabling the tokens in a row (or column) to contribute to the information\naggregations in adjacent but different strips of rows (or columns). Secondly, a\n\\textbf{C}ascade \\textbf{G}roup \\textbf{S}trip \\textbf{M}ixing \\textbf{M}odule\n(CGSMM) is proposed to overcome the performance degradation caused by small\nspatial feature size. The module allows tokens to interact more effectively in\nthe manners of within-patch and cross-patch, which is independent to the\nfeature spatial size. Finally, based on the Strip MLP layer, we propose a novel\n\\textbf{L}ocal \\textbf{S}trip \\textbf{M}ixing \\textbf{M}odule (LSMM) to boost\nthe token interaction power in the local region. Extensive experiments\ndemonstrate that Strip-MLP significantly improves the performance of MLP-based\nmodels on small datasets and obtains comparable or even better results on\nImageNet. In particular, Strip-MLP models achieve higher average Top-1 accuracy\nthan existing MLP-based models by +2.44\\% on Caltech-101 and +2.16\\% on\nCIFAR-100. 
The source codes will be available\nat~\\href{https://github.com/Med-Process/Strip_MLP{https://github.com/Med-Process/Strip\\_MLP}.\n","authors":["Guiping Cao","Shengda Luo","Wenjian Huang","Xiangyuan Lan","Dongmei Jiang","Yaowei Wang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08092v2","updated":"2023-07-21T09:31:57Z","published":"2023-07-16T16:29:26Z","title":"Gait Data Augmentation using Physics-Based Biomechanical Simulation","summary":" This paper focuses on addressing the problem of data scarcity for gait\nanalysis. Standard augmentation methods may produce gait sequences that are not\nconsistent with the biomechanical constraints of human walking. To address this\nissue, we propose a novel framework for gait data augmentation by using\nOpenSIM, a physics-based simulator, to synthesize biomechanically plausible\nwalking sequences. The proposed approach is validated by augmenting the WBDS\nand CASIA-B datasets and then training gait-based classifiers for 3D gender\ngait classification and 2D gait person identification respectively.\nExperimental results indicate that our augmentation approach can improve the\nperformance of model-based gait classifiers and deliver state-of-the-art\nresults for gait-based person identification with an accuracy of up to 96.11%\non the CASIA-B dataset.\n","authors":["Mritula Chandrasekaran","Jarek Francik","Dimitrios Makris"],"pdf_url":"https://arxiv.org/pdf/2307.08092v2.pdf","comment":"30 pages including references, 5 Figures submitted to ESWA"},{"id":"http://arxiv.org/abs/2307.02953v2","updated":"2023-07-21T09:26:06Z","published":"2023-07-06T12:39:06Z","title":"SegNetr: Rethinking the local-global interactions and skip connections\n in U-shaped networks","summary":" Recently, U-shaped networks have dominated the field of medical image\nsegmentation due to their simple and easily tuned structure. However, existing\nU-shaped segmentation networks: 1) mostly focus on designing complex\nself-attention modules to compensate for the lack of long-term dependence based\non convolution operation, which increases the overall number of parameters and\ncomputational complexity of the network; 2) simply fuse the features of encoder\nand decoder, ignoring the connection between their spatial locations. In this\npaper, we rethink the above problem and build a lightweight medical image\nsegmentation network, called SegNetr. Specifically, we introduce a novel\nSegNetr block that can perform local-global interactions dynamically at any\nstage and with only linear complexity. At the same time, we design a general\ninformation retention skip connection (IRSC) to preserve the spatial location\ninformation of encoder features and achieve accurate fusion with the decoder\nfeatures. We validate the effectiveness of SegNetr on four mainstream medical\nimage segmentation datasets, with 59\\% and 76\\% fewer parameters and GFLOPs\nthan vanilla U-Net, while achieving segmentation performance comparable to\nstate-of-the-art methods. 
Notably, the components proposed in this paper can be\napplied to other U-shaped networks to improve their segmentation performance.\n","authors":["Junlong Cheng","Chengrui Gao","Fengjie Wang","Min Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.02953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04246v2","updated":"2023-07-21T09:15:42Z","published":"2023-02-08T18:26:10Z","title":"Shortcut Detection with Variational Autoencoders","summary":" For real-world applications of machine learning (ML), it is essential that\nmodels make predictions based on well-generalizing features rather than\nspurious correlations in the data. The identification of such spurious\ncorrelations, also known as shortcuts, is a challenging problem and has so far\nbeen scarcely addressed. In this work, we present a novel approach to detect\nshortcuts in image and audio datasets by leveraging variational autoencoders\n(VAEs). The disentanglement of features in the latent space of VAEs allows us\nto discover feature-target correlations in datasets and semi-automatically\nevaluate them for ML shortcuts. We demonstrate the applicability of our method\non several real-world datasets and identify shortcuts that have not been\ndiscovered before.\n","authors":["Nicolas M. Müller","Simon Roschmann","Shahbaz Khan","Philip Sperl","Konstantin Böttinger"],"pdf_url":"https://arxiv.org/pdf/2302.04246v2.pdf","comment":"Accepted at the ICML 2023 Workshop on Spurious Correlations,\n Invariance and Stability"},{"id":"http://arxiv.org/abs/2307.04378v3","updated":"2023-07-21T09:13:55Z","published":"2023-07-10T07:24:44Z","title":"Towards Generalizable Diabetic Retinopathy Grading in Unseen Domains","summary":" Diabetic Retinopathy (DR) is a common complication of diabetes and a leading\ncause of blindness worldwide. Early and accurate grading of its severity is\ncrucial for disease management. Although deep learning has shown great\npotential for automated DR grading, its real-world deployment is still\nchallenging due to distribution shifts among source and target domains, known\nas the domain generalization problem. Existing works have mainly attributed the\nperformance degradation to limited domain shifts caused by simple visual\ndiscrepancies, which cannot handle complex real-world scenarios. Instead, we\npresent preliminary evidence suggesting the existence of three-fold\ngeneralization issues: visual and degradation style shifts, diagnostic pattern\ndiversity, and data imbalance. To tackle these issues, we propose a novel\nunified framework named Generalizable Diabetic Retinopathy Grading Network\n(GDRNet). GDRNet consists of three vital components: fundus visual-artifact\naugmentation (FundusAug), dynamic hybrid-supervised loss (DahLoss), and\ndomain-class-aware re-balancing (DCR). FundusAug generates realistic augmented\nimages via visual transformation and image degradation, while DahLoss jointly\nleverages pixel-level consistency and image-level semantics to capture the\ndiverse diagnostic patterns and build generalizable feature representations.\nMoreover, DCR mitigates the data imbalance from a domain-class view and avoids\nundesired over-emphasis on rare domain-class pairs. Finally, we design a\npublicly available benchmark for fair evaluations. 
Extensive comparison\nexperiments against advanced methods and exhaustive ablation studies\ndemonstrate the effectiveness and generalization ability of GDRNet.\n","authors":["Haoxuan Che","Yuhan Cheng","Haibo Jin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.04378v3.pdf","comment":"Early Accepted by MICCAI 2023, the 26th International Conference on\n Medical Image Computing and Computer Assisted Intervention"},{"id":"http://arxiv.org/abs/2305.18310v2","updated":"2023-07-21T09:12:17Z","published":"2023-05-17T14:14:31Z","title":"Motion-Scenario Decoupling for Rat-Aware Video Position Prediction:\n Strategy and Benchmark","summary":" Recently significant progress has been made in human action recognition and\nbehavior prediction using deep learning techniques, leading to improved\nvision-based semantic understanding. However, there is still a lack of\nhigh-quality motion datasets for small bio-robotics, which presents more\nchallenging scenarios for long-term movement prediction and behavior control\nbased on third-person observation. In this study, we introduce RatPose, a\nbio-robot motion prediction dataset constructed by considering the influence\nfactors of individuals and environments based on predefined annotation rules.\nTo enhance the robustness of motion prediction against these factors, we\npropose a Dual-stream Motion-Scenario Decoupling (\\textit{DMSD}) framework that\neffectively separates scenario-oriented and motion-oriented features and\ndesigns a scenario contrast loss and motion clustering loss for overall\ntraining. With such distinctive architecture, the dual-branch feature flow\ninformation is interacted and compensated in a decomposition-then-fusion\nmanner. Moreover, we demonstrate significant performance improvements of the\nproposed \\textit{DMSD} framework on different difficulty-level tasks. We also\nimplement long-term discretized trajectory prediction tasks to verify the\ngeneralization ability of the proposed dataset.\n","authors":["Xiaofeng Liu","Jiaxin Gao","Yaohua Liu","Risheng Liu","Nenggan Zheng"],"pdf_url":"https://arxiv.org/pdf/2305.18310v2.pdf","comment":"Rat, Video Position Prediction"},{"id":"http://arxiv.org/abs/2303.09975v4","updated":"2023-07-21T09:05:53Z","published":"2023-03-17T13:48:17Z","title":"MedNeXt: Transformer-driven Scaling of ConvNets for Medical Image\n Segmentation","summary":" There has been exploding interest in embracing Transformer-based\narchitectures for medical image segmentation. However, the lack of large-scale\nannotated medical datasets make achieving performances equivalent to those in\nnatural images challenging. Convolutional networks, in contrast, have higher\ninductive biases and consequently, are easily trainable to high performance.\nRecently, the ConvNeXt architecture attempted to modernize the standard ConvNet\nby mirroring Transformer blocks. In this work, we improve upon this to design a\nmodernized and scalable convolutional architecture customized to challenges of\ndata-scarce medical settings. 
We introduce MedNeXt, a Transformer-inspired\nlarge kernel segmentation network which introduces - 1) A fully ConvNeXt 3D\nEncoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up\nand downsampling blocks to preserve semantic richness across scales, 3) A novel\ntechnique to iteratively increase kernel sizes by upsampling small kernel\nnetworks, to prevent performance saturation on limited medical data, 4)\nCompound scaling at multiple levels (depth, width, kernel size) of MedNeXt.\nThis leads to state-of-the-art performance on 4 tasks on CT and MRI modalities\nand varying dataset sizes, representing a modernized deep architecture for\nmedical image segmentation. Our code is made publicly available at:\nhttps://github.com/MIC-DKFZ/MedNeXt.\n","authors":["Saikat Roy","Gregor Koehler","Constantin Ulrich","Michael Baumgartner","Jens Petersen","Fabian Isensee","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.09975v4.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11438v1","updated":"2023-07-21T08:58:49Z","published":"2023-07-21T08:58:49Z","title":"Attention Consistency Refined Masked Frequency Forgery Representation\n for Generalizing Face Forgery Detection","summary":" Due to the successful development of deep image generation technology, visual\ndata forgery detection would play a more important role in social and economic\nsecurity. Existing forgery detection methods suffer from unsatisfactory\ngeneralization ability to determine the authenticity in the unseen domain. In\nthis paper, we propose a novel Attention Consistency Refined masked frequency\nforgery representation model toward generalizing face forgery detection\nalgorithm (ACMF). Most forgery technologies always bring in high-frequency\naware cues, which make it easy to distinguish source authenticity but difficult\nto generalize to unseen artifact types. The masked frequency forgery\nrepresentation module is designed to explore robust forgery cues by randomly\ndiscarding high-frequency information. In addition, we find that the forgery\nattention map inconsistency through the detection network could affect the\ngeneralizability. Thus, the forgery attention consistency is introduced to\nforce detectors to focus on similar attention regions for better generalization\nability. Experiment results on several public face forgery datasets\n(FaceForensic++, DFD, Celeb-DF, and WDF datasets) demonstrate the superior\nperformance of the proposed method compared with the state-of-the-art methods.\n","authors":["Decheng Liu","Tao Chen","Chunlei Peng","Nannan Wang","Ruimin Hu","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2307.11438v1.pdf","comment":"The source code and models are publicly available at\n https://github.com/chenboluo/ACMF"},{"id":"http://arxiv.org/abs/2307.11434v1","updated":"2023-07-21T08:55:23Z","published":"2023-07-21T08:55:23Z","title":"Batching for Green AI -- An Exploratory Study on Inference","summary":" The batch size is an essential parameter to tune during the development of\nnew neural networks. Amongst other quality indicators, it has a large degree of\ninfluence on the model's accuracy, generalisability, training times and\nparallelisability. This fact is generally known and commonly studied. However,\nduring the application phase of a deep learning model, when the model is\nutilised by an end-user for inference, we find that there is a disregard for\nthe potential benefits of introducing a batch size. 
In this study, we examine\nthe effect of input batching on the energy consumption and response times of\nfive fully-trained neural networks for computer vision that were considered\nstate-of-the-art at the time of their publication. The results suggest that\nbatching has a significant effect on both of these metrics. Furthermore, we\npresent a timeline of the energy efficiency and accuracy of neural networks\nover the past decade. We find that in general, energy consumption rises at a\nmuch steeper pace than accuracy and question the necessity of this evolution.\nAdditionally, we highlight one particular network, ShuffleNetV2(2018), that\nachieved a competitive performance for its time while maintaining a much lower\nenergy consumption. Nevertheless, we highlight that the results are model\ndependent.\n","authors":["Tim Yarally","Luís Cruz","Daniel Feitosa","June Sallou","Arie van Deursen"],"pdf_url":"https://arxiv.org/pdf/2307.11434v1.pdf","comment":"8 pages, 4 figures, 1 table. Accepted at Euromicro Conference Series\n on Software Engineering and Advanced Applications (SEAA) 2023"},{"id":"http://arxiv.org/abs/2307.09004v2","updated":"2023-07-21T08:41:23Z","published":"2023-07-18T06:44:20Z","title":"Ord2Seq: Regarding Ordinal Regression as Label Sequence Prediction","summary":" Ordinal regression refers to classifying object instances into ordinal\ncategories. It has been widely studied in many scenarios, such as medical\ndisease grading, movie rating, etc. Known methods focused only on learning\ninter-class ordinal relationships, but still incur limitations in\ndistinguishing adjacent categories thus far. In this paper, we propose a simple\nsequence prediction framework for ordinal regression called Ord2Seq, which, for\nthe first time, transforms each ordinal category label into a special label\nsequence and thus regards an ordinal regression task as a sequence prediction\nprocess. In this way, we decompose an ordinal regression task into a series of\nrecursive binary classification steps, so as to subtly distinguish adjacent\ncategories. Comprehensive experiments show the effectiveness of distinguishing\nadjacent categories for performance improvement and our new approach exceeds\nstate-of-the-art performances in four different scenarios. Codes are available\nat https://github.com/wjh892521292/Ord2Seq.\n","authors":["Jinhong Wang","Yi Cheng","Jintai Chen","Tingting Chen","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09004v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2208.00657v2","updated":"2023-07-21T08:39:22Z","published":"2022-08-01T07:35:45Z","title":"SiamixFormer: a fully-transformer Siamese network with temporal Fusion\n for accurate building detection and change detection in bi-temporal remote\n sensing images","summary":" Building detection and change detection using remote sensing images can help\nurban and rescue planning. Moreover, they can be used for building damage\nassessment after natural disasters. Currently, most of the existing models for\nbuilding detection use only one image (pre-disaster image) to detect buildings.\nThis is based on the idea that post-disaster images reduce the model's\nperformance because of presence of destroyed buildings. In this paper, we\npropose a siamese model, called SiamixFormer, which uses pre- and post-disaster\nimages as input. Our model has two encoders and has a hierarchical transformer\narchitecture. 
The output of each stage in both encoders is given to a temporal\ntransformer for feature fusion in a way that query is generated from\npre-disaster images and (key, value) is generated from post-disaster images. To\nthis end, temporal features are also considered in feature fusion. Another\nadvantage of using temporal transformers in feature fusion is that they can\nbetter maintain large receptive fields generated by transformer encoders\ncompared with CNNs. Finally, the output of the temporal transformer is given to\na simple MLP decoder at each stage. The SiamixFormer model is evaluated on xBD,\nand WHU datasets, for building detection and on LEVIR-CD and CDD datasets for\nchange detection and could outperform the state-of-the-art.\n","authors":["Amir Mohammadian","Foad Ghaderi"],"pdf_url":"https://arxiv.org/pdf/2208.00657v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11418v1","updated":"2023-07-21T08:22:14Z","published":"2023-07-21T08:22:14Z","title":"FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural\n Radiance Fields","summary":" As recent advances in Neural Radiance Fields (NeRF) have enabled\nhigh-fidelity 3D face reconstruction and novel view synthesis, its manipulation\nalso became an essential task in 3D vision. However, existing manipulation\nmethods require extensive human labor, such as a user-provided semantic mask\nand manual attribute search unsuitable for non-expert users. Instead, our\napproach is designed to require a single text to manipulate a face\nreconstructed with NeRF. To do so, we first train a scene manipulator, a latent\ncode-conditional deformable NeRF, over a dynamic scene to control a face\ndeformation using the latent code. However, representing a scene deformation\nwith a single latent code is unfavorable for compositing local deformations\nobserved in different instances. As so, our proposed Position-conditional\nAnchor Compositor (PAC) learns to represent a manipulated scene with spatially\nvarying latent codes. Their renderings with the scene manipulator are then\noptimized to yield high cosine similarity to a target text in CLIP embedding\nspace for text-driven manipulation. To the best of our knowledge, our approach\nis the first to address the text-driven manipulation of a face reconstructed\nwith NeRF. Extensive results, comparisons, and ablation studies demonstrate the\neffectiveness of our approach.\n","authors":["Sungwon Hwang","Junha Hyung","Daejin Kim","Min-Jung Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2307.11418v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11413v1","updated":"2023-07-21T08:15:39Z","published":"2023-07-21T08:15:39Z","title":"A Video-based Detector for Suspicious Activity in Examination with\n OpenPose","summary":" Examinations are a crucial part of the learning process, and academic\ninstitutions invest significant resources into maintaining their integrity by\npreventing cheating from students or facilitators. However, cheating has become\nrampant in examination setups, compromising their integrity. The traditional\nmethod of relying on invigilators to monitor every student is impractical and\nineffective. To address this issue, there is a need to continuously record exam\nsessions to monitor students for suspicious activities. However, these\nrecordings are often too lengthy for invigilators to analyze effectively, and\nfatigue may cause them to miss significant details. To widen the coverage,\ninvigilators could use fixed overhead or wearable cameras. 
This paper\nintroduces a framework that uses automation to analyze videos and detect\nsuspicious activities during examinations efficiently and effectively. We\nutilized the OpenPose framework and Convolutional Neural Network (CNN) to\nidentify students exchanging objects during exams. This detection system is\nvital in preventing cheating and promoting academic integrity, fairness, and\nquality education for institutions.\n","authors":["Reuben Moyo","Stanley Ndebvu","Michael Zimba","Jimmy Mbelwa"],"pdf_url":"https://arxiv.org/pdf/2307.11413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11411v1","updated":"2023-07-21T08:10:26Z","published":"2023-07-21T08:10:26Z","title":"Deep Directly-Trained Spiking Neural Networks for Object Detection","summary":" Spiking neural networks (SNNs) are brain-inspired energy-efficient models\nthat encode information in spatiotemporal dynamics. Recently, deep SNNs trained\ndirectly have shown great success in achieving high performance on\nclassification tasks with very few time steps. However, how to design a\ndirectly-trained SNN for the regression task of object detection still remains\na challenging problem. To address this problem, we propose EMS-YOLO, a novel\ndirectly-trained SNN framework for object detection, which is the first trial\nto train a deep SNN with surrogate gradients for object detection rather than\nANN-SNN conversion strategies. Specifically, we design a full-spike residual\nblock, EMS-ResNet, which can effectively extend the depth of the\ndirectly-trained SNN with low power consumption. Furthermore, we theoretically\nanalyze and prove the EMS-ResNet could avoid gradient vanishing or exploding.\nThe results demonstrate that our approach outperforms the state-of-the-art\nANN-SNN conversion methods (at least 500 time steps) in extremely fewer time\nsteps (only 4 time steps). It is shown that our model could achieve comparable\nperformance to the ANN with the same architecture while consuming 5.83 times\nless energy on the frame-based COCO Dataset and the event-based Gen1 Dataset.\n","authors":["Qiaoyi Su","Yuhong Chou","Yifan Hu","Jianing Li","Shijie Mei","Ziyang Zhang","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2307.11411v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.11410v1","updated":"2023-07-21T08:09:47Z","published":"2023-07-21T08:09:47Z","title":"Subject-Diffusion:Open Domain Personalized Text-to-Image Generation\n without Test-time Fine-tuning","summary":" Recent progress in personalized image generation using diffusion models has\nbeen significant. However, development in the area of open-domain and\nnon-fine-tuning personalized image generation is proceeding rather slowly. In\nthis paper, we propose Subject-Diffusion, a novel open-domain personalized\nimage generation model that, in addition to not requiring test-time\nfine-tuning, also only requires a single reference image to support\npersonalized generation of single- or multi-subject in any domain. Firstly, we\nconstruct an automatic data labeling tool and use the LAION-Aesthetics dataset\nto construct a large-scale dataset consisting of 76M images and their\ncorresponding subject detection bounding boxes, segmentation masks and text\ndescriptions. Secondly, we design a new unified framework that combines text\nand image semantics by incorporating coarse location and fine-grained reference\nimage control to maximize subject fidelity and generalization. 
Furthermore, we\nalso adopt an attention control mechanism to support multi-subject generation.\nExtensive qualitative and quantitative results demonstrate that our method\noutperforms other SOTA frameworks in single, multiple, and human customized\nimage generation. Please refer to our\n\\href{https://oppo-mente-lab.github.io/subject_diffusion/}{project page}\n","authors":["Jian Ma","Junhao Liang","Chen Chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2307.11410v1.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.11404v1","updated":"2023-07-21T07:56:32Z","published":"2023-07-21T07:56:32Z","title":"Latent-OFER: Detect, Mask, and Reconstruct with Latent Vectors for\n Occluded Facial Expression Recognition","summary":" Most research on facial expression recognition (FER) is conducted in highly\ncontrolled environments, but its performance is often unacceptable when applied\nto real-world situations. This is because when unexpected objects occlude the\nface, the FER network faces difficulties extracting facial features and\naccurately predicting facial expressions. Therefore, occluded FER (OFER) is a\nchallenging problem. Previous studies on occlusion-aware FER have typically\nrequired fully annotated facial images for training. However, collecting facial\nimages with various occlusions and expression annotations is time-consuming and\nexpensive. Latent-OFER, the proposed method, can detect occlusions, restore\noccluded parts of the face as if they were unoccluded, and recognize them,\nimproving FER accuracy. This approach involves three steps: First, the vision\ntransformer (ViT)-based occlusion patch detector masks the occluded position by\ntraining only latent vectors from the unoccluded patches using the support\nvector data description algorithm. Second, the hybrid reconstruction network\ngenerates the masking position as a complete image using the ViT and\nconvolutional neural network (CNN). Last, the expression-relevant latent vector\nextractor retrieves and uses expression-related information from all latent\nvectors by applying a CNN-based class activation map. This mechanism has a\nsignificant advantage in preventing performance degradation from occlusion by\nunseen objects. The experimental results on several databases demonstrate the\nsuperiority of the proposed method over state-of-the-art methods.\n","authors":["Isack Lee","Eungi Lee","Seok Bong Yoo"],"pdf_url":"https://arxiv.org/pdf/2307.11404v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.11397v1","updated":"2023-07-21T07:29:38Z","published":"2023-07-21T07:29:38Z","title":"Probabilistic Modeling of Inter- and Intra-observer Variability in\n Medical Image Segmentation","summary":" Medical image segmentation is a challenging task, particularly due to inter-\nand intra-observer variability, even between medical experts. In this paper, we\npropose a novel model, called Probabilistic Inter-Observer and iNtra-Observer\nvariation NetwOrk (Pionono). It captures the labeling behavior of each rater\nwith a multidimensional probability distribution and integrates this\ninformation with the feature maps of the image to produce probabilistic\nsegmentation predictions. The model is optimized by variational inference and\ncan be trained end-to-end. 
It outperforms state-of-the-art models such as\nSTAPLE, Probabilistic U-Net, and models based on confusion matrices.\nAdditionally, Pionono predicts multiple coherent segmentation maps that mimic\nthe rater's expert opinion, which provides additional valuable information for\nthe diagnostic process. Experiments on real-world cancer segmentation datasets\ndemonstrate the high accuracy and efficiency of Pionono, making it a powerful\ntool for medical image analysis.\n","authors":["Arne Schmidt","Pablo Morales-Álvarez","Rafael Molina"],"pdf_url":"https://arxiv.org/pdf/2307.11397v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.09815v2","updated":"2023-07-21T07:10:28Z","published":"2023-07-19T08:03:53Z","title":"LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network","summary":" Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent\nblur is a challenging task.~Existing blur map-based deblurring methods have\ndemonstrated promising results. In this paper, we propose, to the best of our\nknowledge, the first framework to introduce the contrastive language-image\npre-training framework (CLIP) to achieve accurate blur map estimation from DP\npairs unsupervisedly. To this end, we first carefully design text prompts to\nenable CLIP to understand blur-related geometric prior knowledge from the DP\npair. Then, we propose a format to input stereo DP pair to the CLIP without any\nfine-tuning, where the CLIP is pre-trained on monocular images. Given the\nestimated blur map, we introduce a blur-prior attention block, a blur-weighting\nloss and a blur-aware loss to recover the all-in-focus image. Our method\nachieves state-of-the-art performance in extensive experiments.\n","authors":["Hao Yang","Liyuan Pan","Yan Yang","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10577v2","updated":"2023-07-21T06:59:21Z","published":"2023-07-20T04:41:39Z","title":"Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced\n Perception based on Joint-Embedding & Contextual Label Affinity","summary":" Traditional computer vision models often require extensive manual effort for\ndata acquisition, annotation and validation, particularly when detecting subtle\nbehavioral nuances or events. The difficulty in distinguishing routine\nbehaviors from potential risks in real-world applications, such as\ndifferentiating routine shopping from potential shoplifting, further\ncomplicates the process. Moreover, these models may demonstrate high false\npositive rates and imprecise event detection when exposed to real-world\nscenarios that differ significantly from the conditions of the training data.\n To overcome these hurdles, we present Ethosight, a novel zero-shot computer\nvision system. Ethosight initiates with a clean slate based on user\nrequirements and semantic knowledge of interest. Using localized label affinity\ncalculations and a reasoning-guided iterative learning loop, Ethosight infers\nscene details and iteratively refines the label set. Reasoning mechanisms can\nbe derived from large language models like GPT4, symbolic reasoners like\nOpenNARS\\cite{wang2013}\\cite{wang2006}, or hybrid systems.\n Our evaluations demonstrate Ethosight's efficacy across 40 complex use cases,\nspanning domains such as health, safety, and security. 
Detailed results and\ncase studies within the main body of this paper and an appendix underscore a\npromising trajectory towards enhancing the adaptability and resilience of\ncomputer vision models in detecting and extracting subtle and nuanced\nbehaviors.\n","authors":["Hugo Latapie","Kristinn R. Thorisson","Shan Yu","Vahagn Petrosyan","Patrick Hammer","Pei Wang","Brandon Kynoch","Hanning Chen","Tangrui Li"],"pdf_url":"https://arxiv.org/pdf/2307.10577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11386v1","updated":"2023-07-21T06:56:21Z","published":"2023-07-21T06:56:21Z","title":"CLR: Channel-wise Lightweight Reprogramming for Continual Learning","summary":" Continual learning aims to emulate the human ability to continually\naccumulate knowledge over sequential tasks. The main challenge is to maintain\nperformance on previously learned tasks after learning new tasks, i.e., to\navoid catastrophic forgetting. We propose a Channel-wise Lightweight\nReprogramming (CLR) approach that helps convolutional neural networks (CNNs)\novercome catastrophic forgetting during continual learning. We show that a CNN\nmodel trained on an old task (or self-supervised proxy task) could be\n``reprogrammed\" to solve a new task by using our proposed lightweight (very\ncheap) reprogramming parameter. With the help of CLR, we have a better\nstability-plasticity trade-off to solve continual learning problems: To\nmaintain stability and retain previous task ability, we use a common\ntask-agnostic immutable part as the shared ``anchor\" parameter set. We then add\ntask-specific lightweight reprogramming parameters to reinterpret the outputs\nof the immutable parts, to enable plasticity and integrate new knowledge. To\nlearn sequential tasks, we only train the lightweight reprogramming parameters\nto learn each new task. Reprogramming parameters are task-specific and\nexclusive to each task, which makes our method immune to catastrophic\nforgetting. To minimize the parameter requirement of reprogramming to learn new\ntasks, we make reprogramming lightweight by only adjusting essential kernels\nand learning channel-wise linear mappings from anchor parameters to\ntask-specific domain knowledge. We show that, for general CNNs, the CLR\nparameter increase is less than 0.6\\% for any new task. Our method outperforms\n13 state-of-the-art continual learning baselines on a new challenging sequence\nof 53 image classification datasets. Code and data are available at\nhttps://github.com/gyhandy/Channel-wise-Lightweight-Reprogramming\n","authors":["Yunhao Ge","Yuecheng Li","Shuo Ni","Jiaping Zhao","Ming-Hsuan Yang","Laurent Itti"],"pdf_url":"https://arxiv.org/pdf/2307.11386v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.06146v2","updated":"2023-07-21T06:34:54Z","published":"2023-03-10T18:59:33Z","title":"StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces","summary":" Recent advances in face manipulation using StyleGAN have produced impressive\nresults. However, StyleGAN is inherently limited to cropped aligned faces at a\nfixed image resolution it is pre-trained on. In this paper, we propose a simple\nand effective solution to this limitation by using dilated convolutions to\nrescale the receptive fields of shallow layers in StyleGAN, without altering\nany model parameters. This allows fixed-size small features at shallow layers\nto be extended into larger ones that can accommodate variable resolutions,\nmaking them more robust in characterizing unaligned faces. 
To enable real face\ninversion and manipulation, we introduce a corresponding encoder that provides\nthe first-layer feature of the extended StyleGAN in addition to the latent\nstyle code. We validate the effectiveness of our method using unaligned face\ninputs of various resolutions in a diverse set of face manipulation tasks,\nincluding facial attribute editing, super-resolution, sketch/mask-to-face\ntranslation, and face toonification.\n","authors":["Shuai Yang","Liming Jiang","Ziwei Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2303.06146v2.pdf","comment":"ICCV 2023. Code: https://github.com/williamyang1991/StyleGANEX\n Project page: https://www.mmlab-ntu.com/project/styleganex/"},{"id":"http://arxiv.org/abs/2307.11375v1","updated":"2023-07-21T06:17:09Z","published":"2023-07-21T06:17:09Z","title":"LatentAugment: Data Augmentation via Guided Manipulation of GAN's Latent\n Space","summary":" Data Augmentation (DA) is a technique to increase the quantity and diversity\nof the training data, and by that alleviate overfitting and improve\ngeneralisation. However, standard DA produces synthetic data for augmentation\nwith limited diversity. Generative Adversarial Networks (GANs) may unlock\nadditional information in a dataset by generating synthetic samples having the\nappearance of real images. However, these models struggle to simultaneously\naddress three key requirements: fidelity and high-quality samples; diversity\nand mode coverage; and fast sampling. Indeed, GANs generate high-quality\nsamples rapidly, but have poor mode coverage, limiting their adoption in DA\napplications. We propose LatentAugment, a DA strategy that overcomes the low\ndiversity of GANs, opening up for use in DA applications. Without external\nsupervision, LatentAugment modifies latent vectors and moves them into latent\nspace regions to maximise the synthetic images' diversity and fidelity. It is\nalso agnostic to the dataset and the downstream task. A wide set of experiments\nshows that LatentAugment improves the generalisation of a deep model\ntranslating from MRI-to-CT beating both standard DA as well GAN-based sampling.\nMoreover, still in comparison with GAN-based sampling, LatentAugment synthetic\nsamples show superior mode coverage and diversity. Code is available at:\nhttps://github.com/ltronchin/LatentAugment.\n","authors":["Lorenzo Tronchin","Minh H. Vu","Paolo Soda","Tommy Löfstedt"],"pdf_url":"https://arxiv.org/pdf/2307.11375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16649v4","updated":"2023-07-21T05:46:30Z","published":"2023-05-26T05:41:20Z","title":"FSD: Fully-Specialized Detector via Neural Architecture Search","summary":" Most generic object detectors are mainly built for standard object detection\ntasks such as COCO and PASCAL VOC. They might not work well and/or efficiently\non tasks of other domains consisting of images that are visually different from\nstandard datasets. To this end, many advances have been focused on adapting a\ngeneral-purposed object detector with limited domain-specific designs. However,\ndesigning a successful task-specific detector requires extraneous manual\nexperiments and parameter tuning through trial and error. In this paper, we\nfirst propose and examine a fully-automatic pipeline to design a\nfully-specialized detector (FSD) which mainly incorporates a\nneural-architectural-searched model by exploring ideal network structures over\nthe backbone and task-specific head. 
On the DeepLesion dataset, extensive\nresults show that FSD can achieve a 3.1 mAP gain while using approximately 40%\nfewer parameters on the binary lesion detection task and improves the mAP by around\n10% on the multi-type lesion detection task via our region-aware graph modeling\ncompared with existing general-purpose medical lesion detection networks.\n","authors":["Zhe Huang","Yudian Li"],"pdf_url":"https://arxiv.org/pdf/2305.16649v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11364v1","updated":"2023-07-21T05:33:57Z","published":"2023-07-21T05:33:57Z","title":"Photo2Relief: Let Human in the Photograph Stand Out","summary":" In this paper, we propose a technique for making humans in photographs\nprotrude like reliefs. Unlike previous methods, which mostly focus on the face\nand head, our method aims to generate artworks that describe the whole body\nactivity of the character. One challenge is that there is no ground-truth for\nsupervised deep learning. We introduce a sigmoid variant function to manipulate\ngradients tactfully and train our neural networks by equipping them with a loss\nfunction defined in the gradient domain. The second challenge is that actual\nphotographs are often taken under different lighting conditions. We use an image-based\nrendering technique to address this challenge and acquire rendered images and\ndepth data under different lighting conditions. To make a clear division of\nlabor in network modules, a two-scale architecture is proposed to create\nhigh-quality relief from a single photograph. Extensive experimental results on\na variety of scenes show that our method is a highly effective solution for\ngenerating digital 2.5D artwork from photographs.\n","authors":["Zhongping Ji","Feifei Che","Hanshuo Liu","Ziyi Zhao","Yu-Wei Zhang","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11364v1.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.11360v1","updated":"2023-07-21T05:26:32Z","published":"2023-07-21T05:26:32Z","title":"ParGANDA: Making Synthetic Pedestrians A Reality For Object Detection","summary":" Object detection is a key technique for a number of Computer Vision\napplications, but it often requires large amounts of annotated data to achieve\ndecent results. Moreover, for pedestrian detection specifically, the collected\ndata might contain some personally identifiable information (PII), which is\nhighly restricted in many countries. This label-intensive and privacy-concerning task has recently led to an increasing interest in training the\ndetection models using synthetically generated pedestrian datasets collected\nwith a photo-realistic video game engine. The engine is able to generate\nunlimited amounts of data with precise and consistent annotations, which gives\npotential for significant gains in real-world applications. However, the\nuse of synthetic data for training introduces a synthetic-to-real domain shift\nthat degrades the final performance. To close the gap between the real and\nsynthetic data, we propose to use a Generative Adversarial Network (GAN), which\nperforms parameterized unpaired image-to-image translation to generate more\nrealistic images. The key benefit of using the GAN is its intrinsic preference\nfor low-level changes over geometric ones, which means annotations of a given\nsynthetic image remain accurate even after domain translation is performed, thus\neliminating the need for labeling real data. 
We extensively experimented with\nthe proposed method using MOTSynth dataset to train and MOT17 and MOT20\ndetection datasets to test, with experimental results demonstrating the\neffectiveness of this method. Our approach not only produces visually plausible\nsamples but also does not require any labels of the real domain thus making it\napplicable to the variety of downstream tasks.\n","authors":["Daria Reshetova","Guanhang Wu","Marcel Puyat","Chunhui Gu","Huizhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04541v2","updated":"2023-07-21T05:08:44Z","published":"2023-07-10T13:09:42Z","title":"Learning Large Margin Sparse Embeddings for Open Set Medical Diagnosis","summary":" Fueled by deep learning, computer-aided diagnosis achieves huge advances.\nHowever, out of controlled lab environments, algorithms could face multiple\nchallenges. Open set recognition (OSR), as an important one, states that\ncategories unseen in training could appear in testing. In medical fields, it\ncould derive from incompletely collected training datasets and the constantly\nemerging new or rare diseases. OSR requires an algorithm to not only correctly\nclassify known classes, but also recognize unknown classes and forward them to\nexperts for further diagnosis. To tackle OSR, we assume that known classes\ncould densely occupy small parts of the embedding space and the remaining\nsparse regions could be recognized as unknowns. Following it, we propose Open\nMargin Cosine Loss (OMCL) unifying two mechanisms. The former, called Margin\nLoss with Adaptive Scale (MLAS), introduces angular margin for reinforcing\nintra-class compactness and inter-class separability, together with an adaptive\nscaling factor to strengthen the generalization capacity. The latter, called\nOpen-Space Suppression (OSS), opens the classifier by recognizing sparse\nembedding space as unknowns using proposed feature space descriptors. Besides,\nsince medical OSR is still a nascent field, two publicly available benchmark\ndatasets are proposed for comparison. Extensive ablation studies and feature\nvisualization demonstrate the effectiveness of each design. Compared with\nstate-of-the-art methods, MLAS achieves superior performances, measured by ACC,\nAUROC, and OSCR.\n","authors":["Mingyuan Liu","Lu Xu","Jicong Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.04541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10698v2","updated":"2023-07-21T05:05:52Z","published":"2023-07-20T08:39:20Z","title":"Reverse Knowledge Distillation: Training a Large Model using a Small One\n for Retinal Image Matching on Limited Data","summary":" Retinal image matching plays a crucial role in monitoring disease progression\nand treatment response. However, datasets with matched keypoints between\ntemporally separated pairs of images are not available in abundance to train\ntransformer-based model. We propose a novel approach based on reverse knowledge\ndistillation to train large models with limited data while preventing\noverfitting. Firstly, we propose architectural modifications to a CNN-based\nsemi-supervised method called SuperRetina that help us improve its results on a\npublicly available dataset. Then, we train a computationally heavier model\nbased on a vision transformer encoder using the lighter CNN-based model, which\nis counter-intuitive in the field knowledge-distillation research where\ntraining lighter models based on heavier ones is the norm. 
Surprisingly, such\nreverse knowledge distillation improves generalization even further. Our\nexperiments suggest that high-dimensional fitting in representation space may\nprevent overfitting, unlike training directly to match the final output. We also\nprovide a public dataset with annotations for retinal image keypoint detection\nand matching to help the research community develop algorithms for retinal\nimage applications.\n","authors":["Sahar Almahfouz Nasser","Nihar Gupte","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2307.10698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10829v2","updated":"2023-07-21T04:46:07Z","published":"2023-07-10T12:18:18Z","title":"Exact Diffusion Inversion via Bi-directional Integration Approximation","summary":" Recently, different methods have been proposed to address the inconsistency\nissue of DDIM inversion to enable image editing, such as EDICT\n\\cite{Wallace23EDICT} and Null-text inversion \\cite{Mokady23NullTestInv}.\nHowever, the above methods introduce considerable computational overhead. In\nthis paper, we propose a new technique, named \\emph{bi-directional integration\napproximation} (BDIA), to perform exact diffusion inversion with negligible\ncomputational overhead. Suppose we would like to estimate the next diffusion\nstate $\\boldsymbol{z}_{i-1}$ at timestep $t_i$ with the historical information\n$(i,\\boldsymbol{z}_i)$ and $(i+1,\\boldsymbol{z}_{i+1})$. We first obtain the\nestimated Gaussian noise $\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i)$, and\nthen apply the DDIM update procedure twice for approximating the ODE\nintegration over the next time-slot $[t_i, t_{i-1}]$ in the forward manner and\nthe previous time-slot $[t_i, t_{i+1}]$ in the backward manner. The DDIM step\nfor the previous time-slot is used to refine the integration approximation made\nearlier when computing $\\boldsymbol{z}_i$. One nice property of BDIA-DDIM is\nthat the update expression for $\\boldsymbol{z}_{i-1}$ is a linear combination\nof $(\\boldsymbol{z}_{i+1}, \\boldsymbol{z}_i,\n\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i))$. This allows for exact\nbackward computation of $\\boldsymbol{z}_{i+1}$ given $(\\boldsymbol{z}_i,\n\\boldsymbol{z}_{i-1})$, thus leading to exact diffusion inversion. Experiments\non both image reconstruction and image editing were conducted, confirming our\nstatement. BDIA can also be applied to improve the performance of other ODE\nsolvers in addition to DDIM. In our work, it is found that applying BDIA to the\nEDM sampling procedure produces a slightly better FID score on CIFAR10.\n","authors":["Guoqiang Zhang","J. P. Lewis","W. Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2307.10829v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.11328"},{"id":"http://arxiv.org/abs/2307.11342v1","updated":"2023-07-21T04:15:02Z","published":"2023-07-21T04:15:02Z","title":"Tuning Pre-trained Model via Moment Probing","summary":" Recently, efficient fine-tuning of large-scale pre-trained models has\nattracted increasing research interest, where linear probing (LP) as a\nfundamental module is involved in exploiting the final representations for\ntask-dependent classification. However, most of the existing methods focus on\nhow to effectively introduce a few learnable parameters, and little work\npays attention to the commonly used LP module. 
In this paper, we propose a\nnovel Moment Probing (MP) method to further explore the potential of LP.\nDistinguished from LP, which builds a linear classification head based on the\nmean of final features (e.g., word tokens for ViT) or classification tokens,\nour MP performs a linear classifier on feature distribution, which provides\nstronger representation ability by exploiting richer statistical information\ninherent in features. Specifically, we represent feature distribution by its\ncharacteristic function, which is efficiently approximated by using first- and\nsecond-order moments of features. Furthermore, we propose a multi-head\nconvolutional cross-covariance (MHC$^3$) to compute second-order moments in an\nefficient and effective manner. By considering that MP could affect feature\nlearning, we introduce a partially shared module to learn two recalibrating\nparameters (PSRP) for backbones based on MP, namely MP$_{+}$. Extensive\nexperiments on ten benchmarks using various models show that our MP\nsignificantly outperforms LP and is competitive with counterparts at lower\ntraining cost, while our MP$_{+}$ achieves state-of-the-art performance.\n","authors":["Mingze Gao","Qilong Wang","Zhenyi Lin","Pengfei Zhu","Qinghua Hu","Jingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11342v1.pdf","comment":"Accepted to ICCV 2023; Project Page:\n https://github.com/mingzeG/Moment-Probing"},{"id":"http://arxiv.org/abs/2307.11336v1","updated":"2023-07-21T03:50:23Z","published":"2023-07-21T03:50:23Z","title":"Character Time-series Matching For Robust License Plate Recognition","summary":" Automatic License Plate Recognition (ALPR) is becoming a popular study area\nand is applied in many fields such as transportation and smart cities. However,\nthere are still several limitations when applying many current methods to\npractical problems due to the variation in real-world situations such as light\nchanges, unclear License Plate (LP) characters, and image quality. Most\nrecent ALPR algorithms process a single frame, which reduces accuracy when\nimage quality is poor. This paper presents methods to improve license\nplate recognition accuracy by tracking the license plate in multiple frames.\nFirst, the Adaptive License Plate Rotation algorithm is applied to correctly\nalign the detected license plate. Second, we propose a method called Character\nTime-series Matching to recognize license plate characters from many\nconsecutive frames. The proposed method achieves high performance on the\nUFPR-ALPR dataset, reaching \\boldmath$96.7\\%$ accuracy in real-time on an RTX A5000\nGPU card. We also deploy the algorithm for the Vietnamese ALPR system. The\naccuracies for license plate detection and character recognition are 0.881 and\n0.979 $mAP^{test}$@.5, respectively. 
The source code is available at\nhttps://github.com/chequanghuy/Character-Time-series-Matching.git\n","authors":["Quang Huy Che","Tung Do Thanh","Cuong Truong Van"],"pdf_url":"https://arxiv.org/pdf/2307.11336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11335v1","updated":"2023-07-21T03:47:28Z","published":"2023-07-21T03:47:28Z","title":"Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural\n Radiance Fields","summary":" Despite the tremendous progress in neural radiance fields (NeRF), we still\nface a dilemma of the trade-off between quality and efficiency, e.g., MipNeRF\npresents fine-detailed and anti-aliased renderings but takes days for training,\nwhile Instant-ngp can accomplish the reconstruction in a few minutes but\nsuffers from blurring or aliasing when rendering at various distances or\nresolutions due to ignoring the sampling area. To this end, we propose a novel\nTri-Mip encoding that enables both instant reconstruction and anti-aliased\nhigh-fidelity rendering for neural radiance fields. The key is to factorize the\npre-filtered 3D feature spaces in three orthogonal mipmaps. In this way, we can\nefficiently perform 3D area sampling by taking advantage of 2D pre-filtered\nfeature maps, which significantly elevates the rendering quality without\nsacrificing efficiency. To cope with the novel Tri-Mip representation, we\npropose a cone-casting rendering technique to efficiently sample anti-aliased\n3D features with the Tri-Mip encoding considering both pixel imaging and\nobserving distance. Extensive experiments on both synthetic and real-world\ndatasets demonstrate our method achieves state-of-the-art rendering quality and\nreconstruction speed while maintaining a compact representation that reduces\n25% model size compared against Instant-ngp.\n","authors":["Wenbo Hu","Yuling Wang","Lin Ma","Bangbang Yang","Lin Gao","Xiao Liu","Yuewen Ma"],"pdf_url":"https://arxiv.org/pdf/2307.11335v1.pdf","comment":"Accepted to ICCV 2023 Project page:\n https://wbhu.github.io/projects/Tri-MipRF"},{"id":"http://arxiv.org/abs/2307.11334v1","updated":"2023-07-21T03:43:07Z","published":"2023-07-21T03:43:07Z","title":"Improving Transferability of Adversarial Examples via Bayesian Attacks","summary":" This paper presents a substantial extension of our work published at ICLR.\nOur ICLR work advocated for enhancing transferability in adversarial examples\nby incorporating a Bayesian formulation into model parameters, which\neffectively emulates the ensemble of infinitely many deep neural networks,\nwhile, in this paper, we introduce a novel extension by incorporating the\nBayesian formulation into the model input as well, enabling the joint\ndiversification of both the model input and model parameters. Our empirical\nfindings demonstrate that: 1) the combination of Bayesian formulations for both\nthe model input and model parameters yields significant improvements in\ntransferability; 2) by introducing advanced approximations of the posterior\ndistribution over the model input, adversarial transferability achieves further\nenhancement, surpassing all state-of-the-arts when attacking without model\nfine-tuning. Moreover, we propose a principled approach to fine-tune model\nparameters in such an extended Bayesian formulation. The derived optimization\nobjective inherently encourages flat minima in the parameter space and input\nspace. 
Extensive experiments demonstrate that our method achieves a new\nstate-of-the-art on transfer-based attacks, improving the average success rate\non ImageNet and CIFAR-10 by 19.14% and 2.08%, respectively, when comparing with\nour ICLR basic Bayesian method. We will make our code publicly available.\n","authors":["Qizhang Li","Yiwen Guo","Xiaochen Yang","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11323v1","updated":"2023-07-21T03:08:28Z","published":"2023-07-21T03:08:28Z","title":"HVDetFusion: A Simple and Robust Camera-Radar Fusion Framework","summary":" In the field of autonomous driving, 3D object detection is a very important\nperception module. Although the current SOTA algorithm combines Camera and\nLidar sensors, limited by the high price of Lidar, the current mainstream\nlanding schemes are pure Camera sensors or Camera+Radar sensors. In this study,\nwe propose a new detection algorithm called HVDetFusion, which is a multi-modal\ndetection algorithm that not only supports pure camera data as input for\ndetection, but also can perform fusion input of radar data and camera data. The\ncamera stream does not depend on the input of Radar data, thus addressing the\ndownside of previous methods. In the pure camera stream, we modify the\nframework of Bevdet4D for better perception and more efficient inference, and\nthis stream has the whole 3D detection output. Further, to incorporate the\nbenefits of Radar signals, we use the prior information of different object\npositions to filter the false positive information of the original radar data,\naccording to the positioning information and radial velocity information\nrecorded by the radar sensors to supplement and fuse the BEV features generated\nby the original camera data, and the effect is further improved in the process\nof fusion training. Finally, HVDetFusion achieves the new state-of-the-art\n67.4\\% NDS on the challenging nuScenes test set among all camera-radar 3D\nobject detectors. The code is available at\nhttps://github.com/HVXLab/HVDetFusion\n","authors":["Kai Lei","Zhan Chen","Shuman Jia","Xiaoteng Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11317v1","updated":"2023-07-21T02:57:40Z","published":"2023-07-21T02:57:40Z","title":"XLDA: Linear Discriminant Analysis for Scaling Continual Learning to\n Extreme Classification at the Edge","summary":" Streaming Linear Discriminant Analysis (LDA) while proven in\nClass-incremental Learning deployments at the edge with limited classes (upto\n1000), has not been proven for deployment in extreme classification scenarios.\nIn this paper, we present: (a) XLDA, a framework for Class-IL in edge\ndeployment where LDA classifier is proven to be equivalent to FC layer\nincluding in extreme classification scenarios, and (b) optimizations to enable\nXLDA-based training and inference for edge deployment where there is a\nconstraint on available compute resources. 
We show up to 42x speed up using a\nbatched training approach and up to 5x inference speedup with nearest neighbor\nsearch on extreme datasets like AliProducts (50k classes) and Google Landmarks\nV2 (81k classes)\n","authors":["Karan Shah","Vishruth Veerendranath","Anushka Hebbar","Raghavendra Bhat"],"pdf_url":"https://arxiv.org/pdf/2307.11317v1.pdf","comment":"Submitted at ICML 2023: PAC-Bayes Interactive Learning Workshop"},{"id":"http://arxiv.org/abs/2307.11315v1","updated":"2023-07-21T02:47:18Z","published":"2023-07-21T02:47:18Z","title":"Generating Image-Specific Text Improves Fine-grained Image\n Classification","summary":" Recent vision-language models outperform vision-only models on many image\nclassification tasks. However, because of the absence of paired text/image\ndescriptions, it remains difficult to fine-tune these models for fine-grained\nimage classification. In this work, we propose a method, GIST, for generating\nimage-specific fine-grained text descriptions from image-only datasets, and\nshow that these text descriptions can be used to improve classification. Key\nparts of our method include 1. prompting a pretrained large language model with\ndomain-specific prompts to generate diverse fine-grained text descriptions for\neach class and 2. using a pretrained vision-language model to match each image\nto label-preserving text descriptions that capture relevant visual features in\nthe image. We demonstrate the utility of GIST by fine-tuning vision-language\nmodels on the image-and-generated-text pairs to learn an aligned\nvision-language representation space for improved classification. We evaluate\nour learned representation space in full-shot and few-shot scenarios across\nfour diverse fine-grained classification datasets, each from a different\ndomain. Our method achieves an average improvement of $4.1\\%$ in accuracy over\nCLIP linear probes and an average of $1.1\\%$ improvement in accuracy over the\nprevious state-of-the-art image-text classification method on the full-shot\ndatasets. Our method achieves similar improvements across few-shot regimes.\nCode is available at https://github.com/emu1729/GIST.\n","authors":["Emily Mu","Kathleen M. Lewis","Adrian V. Dalca","John Guttag"],"pdf_url":"https://arxiv.org/pdf/2307.11315v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2212.03434v5","updated":"2023-07-21T02:34:02Z","published":"2022-12-07T03:39:18Z","title":"Name Your Colour For the Task: Artificially Discover Colour Naming via\n Colour Quantisation Transformer","summary":" The long-standing theory that a colour-naming system evolves under dual\npressure of efficient communication and perceptual mechanism is supported by\nmore and more linguistic studies, including analysing four decades of\ndiachronic data from the Nafaanra language. This inspires us to explore whether\nmachine learning could evolve and discover a similar colour-naming system via\noptimising the communication efficiency represented by high-level recognition\nperformance. Here, we propose a novel colour quantisation transformer,\nCQFormer, that quantises colour space while maintaining the accuracy of machine\nrecognition on the quantised images. Given an RGB image, Annotation Branch maps\nit into an index map before generating the quantised image with a colour\npalette; meanwhile the Palette Branch utilises a key-point detection way to\nfind proper colours in the palette among the whole colour space. 
By interacting\nwith colour annotation, CQFormer is able to balance both the machine vision\naccuracy and colour perceptual structure such as distinct and stable colour\ndistribution for discovered colour system. Very interestingly, we even observe\nthe consistent evolution pattern between our artificial colour system and basic\ncolour terms across human languages. Besides, our colour quantisation method\nalso offers an efficient quantisation method that effectively compresses the\nimage storage while maintaining high performance in high-level recognition\ntasks such as classification and detection. Extensive experiments demonstrate\nthe superior performance of our method with extremely low bit-rate colours,\nshowing potential to integrate into quantisation network to quantities from\nimage to network activation. The source code is available at\nhttps://github.com/ryeocthiv/CQFormer\n","authors":["Shenghan Su","Lin Gu","Yue Yang","Zenghui Zhang","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2212.03434v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10769v3","updated":"2023-07-21T02:32:23Z","published":"2023-04-21T06:35:54Z","title":"Deep Multiview Clustering by Contrasting Cluster Assignments","summary":" Multiview clustering (MVC) aims to reveal the underlying structure of\nmultiview data by categorizing data samples into clusters. Deep learning-based\nmethods exhibit strong feature learning capabilities on large-scale datasets.\nFor most existing deep MVC methods, exploring the invariant representations of\nmultiple views is still an intractable problem. In this paper, we propose a\ncross-view contrastive learning (CVCL) method that learns view-invariant\nrepresentations and produces clustering results by contrasting the cluster\nassignments among multiple views. Specifically, we first employ deep\nautoencoders to extract view-dependent features in the pretraining stage. Then,\na cluster-level CVCL strategy is presented to explore consistent semantic label\ninformation among the multiple views in the fine-tuning stage. Thus, the\nproposed CVCL method is able to produce more discriminative cluster assignments\nby virtue of this learning strategy. Moreover, we provide a theoretical\nanalysis of soft cluster assignment alignment. Extensive experimental results\nobtained on several datasets demonstrate that the proposed CVCL method\noutperforms several state-of-the-art approaches.\n","authors":["Jie Chen","Hua Mao","Wai Lok Woo","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2304.10769v3.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.11308v1","updated":"2023-07-21T02:28:54Z","published":"2023-07-21T02:28:54Z","title":"DPM-OT: A New Diffusion Probabilistic Model Based on Optimal Transport","summary":" Sampling from diffusion probabilistic models (DPMs) can be viewed as a\npiecewise distribution transformation, which generally requires hundreds or\nthousands of steps of the inverse diffusion trajectory to get a high-quality\nimage. Recent progress in designing fast samplers for DPMs achieves a trade-off\nbetween sampling speed and sample quality by knowledge distillation or\nadjusting the variance schedule or the denoising equation. However, it can't be\noptimal in both aspects and often suffer from mode mixture in short steps. 
To\ntackle this problem, we innovatively regard inverse diffusion as an optimal\ntransport (OT) problem between latents at different stages and propose the\nDPM-OT, a unified learning framework for fast DPMs with a direct expressway\nrepresented by OT map, which can generate high-quality samples within around 10\nfunction evaluations. By calculating the semi-discrete optimal transport map\nbetween the data latents and the white noise, we obtain an expressway from the\nprior distribution to the data distribution, while significantly alleviating\nthe problem of mode mixture. In addition, we give the error bound of the\nproposed method, which theoretically guarantees the stability of the algorithm.\nExtensive experiments validate the effectiveness and advantages of DPM-OT in\nterms of speed and quality (FID and mode mixture), thus representing an\nefficient solution for generative modeling. Source codes are available at\nhttps://github.com/cognaclee/DPM-OT\n","authors":["Zezeng Li","ShengHao Li","Zhanpeng Wang","Na Lei","Zhongxuan Luo","Xianfeng Gu"],"pdf_url":"https://arxiv.org/pdf/2307.11308v1.pdf","comment":"iccv2023 accepted"},{"id":"http://arxiv.org/abs/2301.06262v3","updated":"2023-07-21T02:28:28Z","published":"2023-01-16T05:08:50Z","title":"Collaborative Perception in Autonomous Driving: Methods, Datasets and\n Challenges","summary":" Collaborative perception is essential to address occlusion and sensor failure\nissues in autonomous driving. In recent years, theoretical and experimental\ninvestigations of novel works for collaborative perception have increased\ntremendously. So far, however, few reviews have focused on systematical\ncollaboration modules and large-scale collaborative perception datasets. This\nwork reviews recent achievements in this field to bridge this gap and motivate\nfuture research. We start with a brief overview of collaboration schemes. After\nthat, we systematically summarize the collaborative perception methods for\nideal scenarios and real-world issues. The former focuses on collaboration\nmodules and efficiency, and the latter is devoted to addressing the problems in\nactual application. Furthermore, we present large-scale public datasets and\nsummarize quantitative results on these benchmarks. Finally, we highlight gaps\nand overlook challenges between current academic research and real-world\napplications. The project page is\nhttps://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving\n","authors":["Yushan Han","Hui Zhang","Huifang Li","Yi Jin","Congyan Lang","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2301.06262v3.pdf","comment":"18 pages, 6 figures. Accepted by IEEE Intelligent Transportation\n Systems Magazine. URL:\n https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving"},{"id":"http://arxiv.org/abs/2307.11307v1","updated":"2023-07-21T02:28:20Z","published":"2023-07-21T02:28:20Z","title":"EndoSurf: Neural Surface Reconstruction of Deformable Tissues with\n Stereo Endoscope Videos","summary":" Reconstructing soft tissues from stereo endoscope videos is an essential\nprerequisite for many medical applications. Previous methods struggle to\nproduce high-quality geometry and appearance due to their inadequate\nrepresentations of 3D scenes. To address this issue, we propose a novel\nneural-field-based method, called EndoSurf, which effectively learns to\nrepresent a deforming surface from an RGBD sequence. In EndoSurf, we model\nsurface dynamics, shape, and texture with three neural fields. 
First, 3D points\nare transformed from the observed space to the canonical space using the\ndeformation field. The signed distance function (SDF) field and radiance field\nthen predict their SDFs and colors, respectively, with which RGBD images can be\nsynthesized via differentiable volume rendering. We constrain the learned shape\nby tailoring multiple regularization strategies and disentangling geometry and\nappearance. Experiments on public endoscope datasets demonstrate that EndoSurf\nsignificantly outperforms existing solutions, particularly in reconstructing\nhigh-fidelity shapes. Code is available at\nhttps://github.com/Ruyi-Zha/endosurf.git.\n","authors":["Ruyi Zha","Xuelian Cheng","Hongdong Li","Mehrtash Harandi","Zongyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2307.11307v1.pdf","comment":"MICCAI 2023 (Early Accept); Ruyi Zha and Xuelian Cheng made equal\n contributions. Corresponding author: Ruyi Zha (ruyi.zha@gmail.com)"},{"id":"http://arxiv.org/abs/2307.10711v2","updated":"2023-07-21T02:06:41Z","published":"2023-07-20T09:06:21Z","title":"AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of\n Diffusion Probabilistic Models","summary":" Existing customization methods require access to multiple reference examples\nto align pre-trained diffusion probabilistic models (DPMs) with user-provided\nconcepts. This paper aims to address the challenge of DPM customization when\nthe only available supervision is a differentiable metric defined on the\ngenerated contents. Since the sampling procedure of DPMs involves recursive\ncalls to the denoising UNet, na\\\"ive gradient backpropagation requires storing\nthe intermediate states of all iterations, resulting in extremely high memory\nconsumption. To overcome this issue, we propose a novel method AdjointDPM,\nwhich first generates new samples from diffusion models by solving the\ncorresponding probability-flow ODEs. It then uses the adjoint sensitivity\nmethod to backpropagate the gradients of the loss to the models' parameters\n(including conditioning signals, network weights, and initial noises) by\nsolving another augmented ODE. To reduce numerical errors in both the forward\ngeneration and gradient backpropagation processes, we further reparameterize\nthe probability-flow ODE and augmented ODE as simple non-stiff ODEs using\nexponential integration. Finally, we demonstrate the effectiveness of\nAdjointDPM on three interesting tasks: converting visual effects into\nidentification text embeddings, finetuning DPMs for specific types of\nstylization, and optimizing initial noise to generate adversarial samples for\nsecurity auditing.\n","authors":["Jiachun Pan","Jun Hao Liew","Vincent Y. F. Tan","Jiashi Feng","Hanshu Yan"],"pdf_url":"https://arxiv.org/pdf/2307.10711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04973v2","updated":"2023-07-21T01:40:31Z","published":"2023-02-09T23:25:28Z","title":"Invariant Slot Attention: Object Discovery with Slot-Centric Reference\n Frames","summary":" Automatically discovering composable abstractions from raw perceptual data is\na long-standing challenge in machine learning. Recent slot-based neural\nnetworks that learn about objects in a self-supervised manner have made\nexciting progress in this direction. However, they typically fall short at\nadequately capturing spatial symmetries present in the visual world, which\nleads to sample inefficiency, such as when entangling object appearance and\npose. 
In this paper, we present a simple yet highly effective method for\nincorporating spatial symmetries via slot-centric reference frames. We\nincorporate equivariance to per-object pose transformations into the attention\nand generation mechanism of Slot Attention by translating, scaling, and\nrotating position encodings. These changes result in little computational\noverhead, are easy to implement, and can result in large gains in terms of data\nefficiency and overall improvements to object discovery. We evaluate our method\non a wide range of synthetic object discovery benchmarks namely CLEVR,\nTetrominoes, CLEVRTex, Objects Room and MultiShapeNet, and show promising\nimprovements on the challenging real-world Waymo Open dataset.\n","authors":["Ondrej Biza","Sjoerd van Steenkiste","Mehdi S. M. Sajjadi","Gamaleldin F. Elsayed","Aravindh Mahendran","Thomas Kipf"],"pdf_url":"https://arxiv.org/pdf/2302.04973v2.pdf","comment":"Accepted at ICML 2023. Project page: https://invariantsa.github.io/"},{"id":"http://arxiv.org/abs/2307.11285v1","updated":"2023-07-21T01:04:52Z","published":"2023-07-21T01:04:52Z","title":"MAS: Towards Resource-Efficient Federated Multiple-Task Learning","summary":" Federated learning (FL) is an emerging distributed machine learning method\nthat empowers in-situ model training on decentralized edge devices. However,\nmultiple simultaneous FL tasks could overload resource-constrained devices. In\nthis work, we propose the first FL system to effectively coordinate and train\nmultiple simultaneous FL tasks. We first formalize the problem of training\nsimultaneous FL tasks. Then, we present our new approach, MAS (Merge and\nSplit), to optimize the performance of training multiple simultaneous FL tasks.\nMAS starts by merging FL tasks into an all-in-one FL task with a multi-task\narchitecture. After training for a few rounds, MAS splits the all-in-one FL\ntask into two or more FL tasks by using the affinities among tasks measured\nduring the all-in-one training. It then continues training each split of FL\ntasks based on model parameters from the all-in-one training. Extensive\nexperiments demonstrate that MAS outperforms other methods while reducing\ntraining time by 2x and reducing energy consumption by 40%. We hope this work\nwill inspire the community to further study and optimize training simultaneous\nFL tasks.\n","authors":["Weiming Zhuang","Yonggang Wen","Lingjuan Lyu","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11285v1.pdf","comment":"ICCV'23. arXiv admin note: substantial text overlap with\n arXiv:2207.04202"},{"id":"http://arxiv.org/abs/2307.11274v1","updated":"2023-07-21T00:15:56Z","published":"2023-07-21T00:15:56Z","title":"Screening Mammography Breast Cancer Detection","summary":" Breast cancer is a leading cause of cancer-related deaths, but current\nprograms are expensive and prone to false positives, leading to unnecessary\nfollow-up and patient anxiety. This paper proposes a solution to automated\nbreast cancer detection, to improve the efficiency and accuracy of screening\nprograms. Different methodologies were tested against the RSNA dataset of\nradiographic breast images of roughly 20,000 female patients and yielded an\naverage validation case pF1 score of 0.56 across methods.\n","authors":["Debajyoti Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2307.11274v1.pdf","comment":"Released @ Apr 2023. 
For associated project files, see\n https://github.com/chakrabortyde/rsna-breast-cancer"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.06576v3","updated":"2023-07-21T16:06:32Z","published":"2023-07-13T06:25:22Z","title":"Going Beyond Local: Global Graph-Enhanced Personalized News\n Recommendations","summary":" Precisely recommending candidate news articles to users has always been a\ncore challenge for personalized news recommendation systems. Most recent works\nprimarily focus on using advanced natural language processing techniques to\nextract semantic information from rich textual data, employing content-based\nmethods derived from local historical news. However, this approach lacks a\nglobal perspective, failing to account for users' hidden motivations and\nbehaviors beyond semantic information. To address this challenge, we propose a\nnovel model called GLORY (Global-LOcal news Recommendation sYstem), which\ncombines global representations learned from other users with local\nrepresentations to enhance personalized recommendation systems. We accomplish\nthis by constructing a Global-aware Historical News Encoder, which includes a\nglobal news graph and employs gated graph neural networks to enrich news\nrepresentations, thereby fusing historical news representations by a historical\nnews aggregator. Similarly, we extend this approach to a Global Candidate News\nEncoder, utilizing a global entity graph and a candidate news aggregator to\nenhance candidate news representation. Evaluation results on two public news\ndatasets demonstrate that our method outperforms existing approaches.\nFurthermore, our model offers more diverse recommendations.\n","authors":["Boming Yang","Dairui Liu","Toyotaro Suzumura","Ruihai Dong","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2307.06576v3.pdf","comment":"10 pages, Recsys 2023"},{"id":"http://arxiv.org/abs/2307.11650v1","updated":"2023-07-21T15:28:47Z","published":"2023-07-21T15:28:47Z","title":"Alleviating the Long-Tail Problem in Conversational Recommender Systems","summary":" Conversational recommender systems (CRS) aim to provide the recommendation\nservice via natural language conversations. To develop an effective CRS,\nhigh-quality CRS datasets are very crucial. However, existing CRS datasets\nsuffer from the long-tail issue, \\ie a large proportion of items are rarely (or\neven never) mentioned in the conversations, which are called long-tail items.\nAs a result, the CRSs trained on these datasets tend to recommend frequent\nitems, and the diversity of the recommended items would be largely reduced,\nmaking users easier to get bored.\n To address this issue, this paper presents \\textbf{LOT-CRS}, a novel\nframework that focuses on simulating and utilizing a balanced CRS dataset (\\ie\ncovering all the items evenly) for improving \\textbf{LO}ng-\\textbf{T}ail\nrecommendation performance of CRSs. In our approach, we design two pre-training\ntasks to enhance the understanding of simulated conversation for long-tail\nitems, and adopt retrieval-augmented fine-tuning with label smoothness strategy\nto further improve the recommendation of long-tail items. 
Extensive experiments\non two public CRS datasets have demonstrated the effectiveness and\nextensibility of our approach, especially on long-tail recommendation.\n","authors":["Zhipeng Zhao","Kun Zhou","Xiaolei Wang","Wayne Xin Zhao","Fan Pan","Zhao Cao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2307.11650v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2307.11496v1","updated":"2023-07-21T11:04:20Z","published":"2023-07-21T11:04:20Z","title":"Identifying document similarity using a fast estimation of the\n Levenshtein Distance based on compression and signatures","summary":" Identifying document similarity has many applications, e.g., source code\nanalysis or plagiarism detection. However, identifying similarities is not\ntrivial and can be time complex. For instance, the Levenshtein Distance is a\ncommon metric to define the similarity between two documents but has quadratic\nruntime which makes it impractical for large documents where large starts with\na few hundred kilobytes. In this paper, we present a novel concept that allows\nestimating the Levenshtein Distance: the algorithm first compresses documents\nto signatures (similar to hash values) using a user-defined compression ratio.\nSignatures can then be compared against each other (some constrains apply)\nwhere the outcome is the estimated Levenshtein Distance. Our evaluation shows\npromising results in terms of runtime efficiency and accuracy. In addition, we\nintroduce a significance score allowing examiners to set a threshold and\nidentify related documents.\n","authors":["Peter Coates","Frank Breitinger"],"pdf_url":"https://arxiv.org/pdf/2307.11496v1.pdf","comment":"In: Proceedings of the Digital Forensics Research Conference Europe\n (DFRWS EU). 2022"},{"id":"http://arxiv.org/abs/2307.10617v2","updated":"2023-07-21T09:49:15Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. 
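[Editor's note: for readers who want a concrete starting point, a minimal sketch of the n-gram plus passive-aggressive setup described in the review-spam abstract above could look as follows. The toy reviews, labels, and hyperparameters are placeholders, not the Deceptive Opinion Spam Corpus or the paper's exact configuration.]

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import make_pipeline

# Toy data standing in for the Deceptive Opinion Spam Corpus (1 = deceptive).
reviews = [
    "the staff was friendly and the room was spotless",
    "best hotel ever everything was perfect perfect perfect",
    "the location was convenient but the bathroom was dated",
    "absolutely flawless stay i will tell everyone i know",
]
labels = [0, 1, 0, 1]

model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2), max_features=5000),  # unigram + bigram features
    PassiveAggressiveClassifier(max_iter=1000, random_state=0),
)
model.fit(reviews, labels)
print(model.predict(["the room was perfect perfect and flawless"]))
```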
The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v2.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.02250v2","updated":"2023-07-21T07:46:03Z","published":"2023-06-04T03:46:45Z","title":"Large Language Model Augmented Narrative Driven Recommendations","summary":" Narrative-driven recommendation (NDR) presents an information access problem\nwhere users solicit recommendations with verbose descriptions of their\npreferences and context, for example, travelers soliciting recommendations for\npoints of interest while describing their likes/dislikes and travel\ncircumstances. These requests are increasingly important with the rise of\nnatural language-based conversational interfaces for search and recommendation\nsystems. However, NDR lacks abundant training data for models, and current\nplatforms commonly do not support these requests. Fortunately, classical\nuser-item interaction datasets contain rich textual data, e.g., reviews, which\noften describe user preferences and context - this may be used to bootstrap\ntraining for NDR models. In this work, we explore using large language models\n(LLMs) for data augmentation to train NDR models. We use LLMs for authoring\nsynthetic narrative queries from user-item interactions with few-shot prompting\nand train retrieval models for NDR on synthetic queries and user-item\ninteraction data. Our experiments demonstrate that this is an effective\nstrategy for training small-parameter retrieval models that outperform other\nretrieval and LLM baselines for narrative-driven recommendation.\n","authors":["Sheshera Mysore","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2306.02250v2.pdf","comment":"RecSys 2023 Camera-ready"},{"id":"http://arxiv.org/abs/2304.04250v2","updated":"2023-07-21T07:39:58Z","published":"2023-04-09T14:52:18Z","title":"Editable User Profiles for Controllable Text Recommendation","summary":" Methods for making high-quality recommendations often rely on learning latent\nrepresentations from interaction data. These methods, while performant, do not\nprovide ready mechanisms for users to control the recommendation they receive.\nOur work tackles this problem by proposing LACE, a novel concept value\nbottleneck model for controllable text recommendations. LACE represents each\nuser with a succinct set of human-readable concepts through retrieval given\nuser-interacted documents and learns personalized representations of the\nconcepts based on user documents. This concept based user profile is then\nleveraged to make recommendations. The design of our model affords control over\nthe recommendations through a number of intuitive interactions with a\ntransparent user profile. We first establish the quality of recommendations\nobtained from LACE in an offline evaluation on three recommendation tasks\nspanning six datasets in warm-start, cold-start, and zero-shot setups. 
Next, we\nvalidate the controllability of LACE under simulated user interactions.\nFinally, we implement LACE in an interactive controllable recommender system\nand conduct a user study to demonstrate that users are able to improve the\nquality of recommendations they receive through interactions with an editable\nuser profile.\n","authors":["Sheshera Mysore","Mahmood Jasim","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2304.04250v2.pdf","comment":"SIGIR-2023 Camera Ready"},{"id":"http://arxiv.org/abs/2307.11325v1","updated":"2023-07-21T03:23:17Z","published":"2023-07-21T03:23:17Z","title":"Analysis of Elephant Movement in Sub-Saharan Africa: Ecological,\n Climatic, and Conservation Perspectives","summary":" The interaction between elephants and their environment has profound\nimplications for both ecology and conservation strategies. This study presents\nan analytical approach to decipher the intricate patterns of elephant movement\nin Sub-Saharan Africa, concentrating on key ecological drivers such as seasonal\nvariations and rainfall patterns. Despite the complexities surrounding these\ninfluential factors, our analysis provides a holistic view of elephant\nmigratory behavior in the context of the dynamic African landscape. Our\ncomprehensive approach enables us to predict the potential impact of these\necological determinants on elephant migration, a critical step in establishing\ninformed conservation strategies. This projection is particularly crucial given\nthe impacts of global climate change on seasonal and rainfall patterns, which\ncould substantially influence elephant movements in the future. The findings of\nour work aim to not only advance the understanding of movement ecology but also\nfoster a sustainable coexistence of humans and elephants in Sub-Saharan Africa.\nBy predicting potential elephant routes, our work can inform strategies to\nminimize human-elephant conflict, effectively manage land use, and enhance\nanti-poaching efforts. This research underscores the importance of integrating\nmovement ecology and climatic variables for effective wildlife management and\nconservation planning.\n","authors":["Matthew Hines","Gregory Glatzer","Shreya Ghosh","Prasenjit Mitra"],"pdf_url":"https://arxiv.org/pdf/2307.11325v1.pdf","comment":"11 pages, 17 figures, Accepted in ACM SIGCAS SIGCHI Conference on\n Computing and Sustainable Societies (COMPASS 2023)"},{"id":"http://arxiv.org/abs/2307.10479v2","updated":"2023-07-21T19:24:10Z","published":"2023-07-19T22:20:06Z","title":"Fast Approximate Nearest Neighbor Search with a Dynamic Exploration\n Graph using Continuous Refinement","summary":" For approximate nearest neighbor search, graph-based algorithms have shown to\noffer the best trade-off between accuracy and search time. We propose the\nDynamic Exploration Graph (DEG) which significantly outperforms existing\nalgorithms in terms of search and exploration efficiency by combining two new\nideas: First, a single undirected even regular graph is incrementally built by\npartially replacing existing edges to integrate new vertices and to update old\nneighborhoods at the same time. Secondly, an edge optimization algorithm is\nused to continuously improve the quality of the graph. 
Combining this ongoing\nrefinement with the graph construction process leads to a well-organized graph\nstructure at all times, resulting in: (1) increased search efficiency, (2)\npredictable index size, (3) guaranteed connectivity and therefore reachability\nof all vertices, and (4) a dynamic graph structure. In addition we investigate\nhow well existing graph-based search systems can handle indexed queries where\nthe seed vertex of a search is the query itself. Such exploration tasks,\ndespite their good starting point, are not necessarily easy. High efficiency in\napproximate nearest neighbor search (ANNS) does not automatically imply good\nperformance in exploratory search. Extensive experiments show that our new\nDynamic Exploration Graph outperforms existing algorithms significantly for\nindexed and unindexed queries.\n","authors":["Nico Hezel","Kai Uwe Barthel","Konstantin Schall","Klaus Jung"],"pdf_url":"https://arxiv.org/pdf/2307.10479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11848v1","updated":"2023-07-21T18:35:24Z","published":"2023-07-21T18:35:24Z","title":"MythQA: Query-Based Large-Scale Check-Worthy Claim Detection through\n Multi-Answer Open-Domain Question Answering","summary":" Check-worthy claim detection aims at providing plausible misinformation to\ndownstream fact-checking systems or human experts to check. This is a crucial\nstep toward accelerating the fact-checking process. Many efforts have been put\ninto how to identify check-worthy claims from a small scale of pre-collected\nclaims, but how to efficiently detect check-worthy claims directly from a\nlarge-scale information source, such as Twitter, remains underexplored. To fill\nthis gap, we introduce MythQA, a new multi-answer open-domain question\nanswering(QA) task that involves contradictory stance mining for query-based\nlarge-scale check-worthy claim detection. The idea behind this is that\ncontradictory claims are a strong indicator of misinformation that merits\nscrutiny by the appropriate authorities. To study this task, we construct\nTweetMythQA, an evaluation dataset containing 522 factoid multi-answer\nquestions based on controversial topics. Each question is annotated with\nmultiple answers. Moreover, we collect relevant tweets for each distinct\nanswer, then classify them into three categories: \"Supporting\", \"Refuting\", and\n\"Neutral\". In total, we annotated 5.3K tweets. Contradictory evidence is\ncollected for all answers in the dataset. Finally, we present a baseline system\nfor MythQA and evaluate existing NLP models for each system component using the\nTweetMythQA dataset. We provide initial benchmarks and identify key challenges\nfor future models to improve upon. Code and data are available at:\nhttps://github.com/TonyBY/Myth-QA\n","authors":["Yang Bai","Anthony Colas","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11848v1.pdf","comment":"Accepted by SIGIR 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.11749v1","updated":"2023-07-21T17:59:15Z","published":"2023-07-21T17:59:15Z","title":"Differentially Private Heavy Hitter Detection using Federated Analytics","summary":" In this work, we study practical heuristics to improve the performance of\nprefix-tree based algorithms for differentially private heavy hitter detection.\nOur model assumes each user has multiple data points and the goal is to learn\nas many of the most frequent data points as possible across all users' data\nwith aggregate and local differential privacy. 
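[Editor's background note: prefix-tree heavy-hitter algorithms of this kind are typically built on a locally differentially private frequency oracle. The sketch below shows one standard such oracle, k-ary (generalized) randomized response with its unbiased frequency estimator; it is a generic illustration with made-up parameters, not the mechanism used in the paper.]

```python
import numpy as np

rng = np.random.default_rng(0)
k, eps, n = 8, 1.0, 50_000                     # domain size, privacy budget, users
true_items = rng.choice(k, size=n, p=np.r_[0.4, 0.2, np.full(6, 0.4 / 6)])

p = np.exp(eps) / (np.exp(eps) + k - 1)        # probability of reporting the true item
q = 1.0 / (np.exp(eps) + k - 1)                # probability of each other item

keep = rng.random(n) < p
noise = rng.integers(0, k - 1, size=n)
noise = noise + (noise >= true_items)          # uniform over the k-1 other items
reports = np.where(keep, true_items, noise)

counts = np.bincount(reports, minlength=k)
est_freq = (counts / n - q) / (p - q)          # unbiased per-item frequency estimates
print(np.round(est_freq, 3))
```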
We propose an adaptive\nhyperparameter tuning algorithm that improves the performance of the algorithm\nwhile satisfying computational, communication and privacy constraints. We\nexplore the impact of different data-selection schemes as well as the impact of\nintroducing deny lists during multiple runs of the algorithm. We test these\nimprovements using extensive experimentation on the Reddit\ndataset~\\cite{caldas2018leaf} on the task of learning the most frequent words.\n","authors":["Karan Chadha","Junye Chen","John Duchi","Vitaly Feldman","Hanieh Hashemi","Omid Javidbakht","Audra McMillan","Kunal Talwar"],"pdf_url":"https://arxiv.org/pdf/2307.11749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03327v2","updated":"2023-07-21T17:54:14Z","published":"2023-03-06T17:54:33Z","title":"Tight Bounds for $γ$-Regret via the Decision-Estimation Coefficient","summary":" In this work, we give a statistical characterization of the $\\gamma$-regret\nfor arbitrary structured bandit problems, the regret which arises when\ncomparing against a benchmark that is $\\gamma$ times the optimal solution. The\n$\\gamma$-regret emerges in structured bandit problems over a function class\n$\\mathcal{F}$ where finding an exact optimum of $f \\in \\mathcal{F}$ is\nintractable. Our characterization is given in terms of the $\\gamma$-DEC, a\nstatistical complexity parameter for the class $\\mathcal{F}$, which is a\nmodification of the constrained Decision-Estimation Coefficient (DEC) of Foster\net al., 2023 (and closely related to the original offset DEC of Foster et al.,\n2021). Our lower bound shows that the $\\gamma$-DEC is a fundamental limit for\nany model class $\\mathcal{F}$: for any algorithm, there exists some $f \\in\n\\mathcal{F}$ for which the $\\gamma$-regret of that algorithm scales (nearly)\nwith the $\\gamma$-DEC of $\\mathcal{F}$. We provide an upper bound showing that\nthere exists an algorithm attaining a nearly matching $\\gamma$-regret. Due to\nsignificant challenges in applying the prior results on the DEC to the\n$\\gamma$-regret case, both our lower and upper bounds require novel techniques\nand a new algorithm.\n","authors":["Margalit Glasgow","Alexander Rakhlin"],"pdf_url":"https://arxiv.org/pdf/2303.03327v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11732v1","updated":"2023-07-21T17:45:28Z","published":"2023-07-21T17:45:28Z","title":"Advancing Ad Auction Realism: Practical Insights & Modeling Implications","summary":" This paper proposes a learning model of online ad auctions that allows for\nthe following four key realistic characteristics of contemporary online\nauctions: (1) ad slots can have different values and click-through rates\ndepending on users' search queries, (2) the number and identity of competing\nadvertisers are unobserved and change with each auction, (3) advertisers only\nreceive partial, aggregated feedback, and (4) payment rules are only partially\nspecified. We model advertisers as agents governed by an adversarial bandit\nalgorithm, independent of auction mechanism intricacies. Our objective is to\nsimulate the behavior of advertisers for counterfactual analysis, prediction,\nand inference purposes. Our findings reveal that, in such richer environments,\n\"soft floors\" can enhance key performance metrics even when bidders are drawn\nfrom the same population. 
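[Editor's note, making the modeling choice above concrete: an advertiser "governed by an adversarial bandit algorithm" can be simulated with a textbook EXP3 learner that treats a discrete bid grid as its arms. The sketch below is a generic EXP3 loop with a made-up reward function and auction, not the authors' simulator.]

```python
import numpy as np

rng = np.random.default_rng(0)
bids = np.linspace(0.1, 1.0, 10)               # discrete bid levels = bandit arms
K, T, gamma = len(bids), 3000, 0.05
weights = np.ones(K)

def auction_reward(bid, value=1.0):
    """Toy auction outcome: win (and pay the bid) if it beats a random rival."""
    rival = rng.uniform(0.2, 0.9)
    return max(value - bid, 0.0) if bid >= rival else 0.0

for _ in range(T):
    probs = (1 - gamma) * weights / weights.sum() + gamma / K
    arm = rng.choice(K, p=probs)
    reward = auction_reward(bids[arm])          # bandit feedback: only the chosen arm
    weights[arm] *= np.exp(gamma * (reward / probs[arm]) / K)
    weights /= weights.max()                    # rescaling all weights keeps probs unchanged

print("preferred bid level:", bids[np.argmax(weights)])
```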
We further demonstrate how to infer advertiser value\ndistributions from observed bids, thereby affirming the practical efficacy of\nour approach even in a more realistic auction setting.\n","authors":["Ming Chen","Sareh Nabi","Marciano Siniscalchi"],"pdf_url":"https://arxiv.org/pdf/2307.11732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11730v1","updated":"2023-07-21T17:43:50Z","published":"2023-07-21T17:43:50Z","title":"Mitigating Communications Threats in Decentralized Federated Learning\n through Moving Target Defense","summary":" The rise of Decentralized Federated Learning (DFL) has enabled the training\nof machine learning models across federated participants, fostering\ndecentralized model aggregation and reducing dependence on a server. However,\nthis approach introduces unique communication security challenges that have yet\nto be thoroughly addressed in the literature. These challenges primarily\noriginate from the decentralized nature of the aggregation process, the varied\nroles and responsibilities of the participants, and the absence of a central\nauthority to oversee and mitigate threats. Addressing these challenges, this\npaper first delineates a comprehensive threat model, highlighting the potential\nrisks of DFL communications. In response to these identified risks, this work\nintroduces a security module designed for DFL platforms to counter\ncommunication-based attacks. The module combines security techniques such as\nsymmetric and asymmetric encryption with Moving Target Defense (MTD)\ntechniques, including random neighbor selection and IP/port switching. The\nsecurity module is implemented in a DFL platform called Fedstellar, allowing\nthe deployment and monitoring of the federation. A DFL scenario has been\ndeployed, involving eight physical devices implementing three security\nconfigurations: (i) a baseline with no security, (ii) an encrypted\nconfiguration, and (iii) a configuration integrating both encryption and MTD\ntechniques. The effectiveness of the security module is validated through\nexperiments with the MNIST dataset and eclipse attacks. The results indicated\nan average F1 score of 95%, with moderate increases in CPU usage (up to 63.2%\n+-3.5%) and network traffic (230 MB +-15 MB) under the most secure\nconfiguration, mitigating the risks posed by eavesdropping or eclipse attacks.\n","authors":["Enrique Tomás Martínez Beltrán","Pedro Miguel Sánchez Sánchez","Sergio López Bernal","Gérôme Bovet","Manuel Gil Pérez","Gregorio Martínez Pérez","Alberto Huertas Celdrán"],"pdf_url":"https://arxiv.org/pdf/2307.11730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10870v2","updated":"2023-07-21T17:37:26Z","published":"2023-02-21T18:34:51Z","title":"On Provable Copyright Protection for Generative Models","summary":" There is a growing concern that learned conditional generative models may\noutput samples that are substantially similar to some copyrighted data $C$ that\nwas in their training set. We give a formal definition of $\\textit{near\naccess-freeness (NAF)}$ and prove bounds on the probability that a model\nsatisfying this definition outputs a sample similar to $C$, even if $C$ is\nincluded in its training set. Roughly speaking, a generative model $p$ is\n$\\textit{$k$-NAF}$ if for every potentially copyrighted data $C$, the output of\n$p$ diverges by at most $k$-bits from the output of a model $q$ that\n$\\textit{did not access $C$ at all}$. 
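[Editor's note: for readers who want the quantifier structure of "diverges by at most k bits" spelled out, one natural reading is via the max-divergence in base 2. The notation below is a hedged formalization consistent with the abstract, not necessarily the paper's exact definitions.]

```latex
% A model $p$ is $k$-NAF (under this hedged reading) if, for every copyrighted
% datum $C$ and every prompt $z$,
\[
  D_{\infty}\!\left(p(\cdot \mid z)\,\middle\|\,q_{C}(\cdot \mid z)\right)
  \;=\; \max_{y}\,\log_{2}\frac{p(y \mid z)}{q_{C}(y \mid z)} \;\le\; k ,
\]
% where $q_{C}$ is a model trained without access to $C$.  Any event $E$
% (e.g., "the sample is substantially similar to $C$") then satisfies
\[
  \Pr_{y \sim p(\cdot\mid z)}\!\left[y \in E\right] \;\le\; 2^{k}\,
  \Pr_{y \sim q_{C}(\cdot\mid z)}\!\left[y \in E\right].
\]
```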
We also give generative model learning\nalgorithms, which efficiently modify the original generative model learning\nalgorithm in a black box manner, that output generative models with strong\nbounds on the probability of sampling protected content. Furthermore, we\nprovide promising experiments for both language (transformers) and image\n(diffusion) generative models, showing minimal degradation in output quality\nwhile ensuring strong protections against sampling protected content.\n","authors":["Nikhil Vyas","Sham Kakade","Boaz Barak"],"pdf_url":"https://arxiv.org/pdf/2302.10870v2.pdf","comment":"Accepted at ICML 2023"},{"id":"http://arxiv.org/abs/2307.10496v2","updated":"2023-07-21T17:34:51Z","published":"2023-07-19T23:29:40Z","title":"A Competitive Learning Approach for Specialized Models: A Solution for\n Complex Physical Systems with Distinct Functional Regimes","summary":" Complex systems in science and engineering sometimes exhibit behavior that\nchanges across different regimes. Traditional global models struggle to capture\nthe full range of this complex behavior, limiting their ability to accurately\nrepresent the system. In response to this challenge, we propose a novel\ncompetitive learning approach for obtaining data-driven models of physical\nsystems. The primary idea behind the proposed approach is to employ dynamic\nloss functions for a set of models that are trained concurrently on the data.\nEach model competes for each observation during training, allowing for the\nidentification of distinct functional regimes within the dataset. To\ndemonstrate the effectiveness of the learning approach, we coupled it with\nvarious regression methods that employ gradient-based optimizers for training.\nThe proposed approach was tested on various problems involving model discovery\nand function approximation, demonstrating its ability to successfully identify\nfunctional regimes, discover true governing equations, and reduce test errors.\n","authors":["Okezzi F. Ukorigho","Opeoluwa Owoyele"],"pdf_url":"https://arxiv.org/pdf/2307.10496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08736v2","updated":"2023-07-21T17:21:57Z","published":"2022-12-16T22:18:48Z","title":"A Neural Network Warm-Start Approach for the Inverse Acoustic Obstacle\n Scattering Problem","summary":" We consider the inverse acoustic obstacle problem for sound-soft star-shaped\nobstacles in two dimensions wherein the boundary of the obstacle is determined\nfrom measurements of the scattered field at a collection of receivers outside\nthe object. One of the standard approaches for solving this problem is to\nreformulate it as an optimization problem: finding the boundary of the domain\nthat minimizes the $L^2$ distance between computed values of the scattered\nfield and the given measurement data. The optimization problem is\ncomputationally challenging since the local set of convexity shrinks with\nincreasing frequency and results in an increasing number of local minima in the\nvicinity of the true solution. In many practical experimental settings, low\nfrequency measurements are unavailable due to limitations of the experimental\nsetup or the sensors used for measurement. Thus, obtaining a good initial guess\nfor the optimization problem plays a vital role in this environment.\n We present a neural network warm-start approach for solving the inverse\nscattering problem, where an initial guess for the optimization problem is\nobtained using a trained neural network. 
We demonstrate the effectiveness of\nour method with several numerical examples. For high frequency problems, this\napproach outperforms traditional iterative methods such as Gauss-Newton\ninitialized without any prior (i.e., initialized using a unit circle), or\ninitialized using the solution of a direct method such as the linear sampling\nmethod. The algorithm remains robust to noise in the scattered field\nmeasurements and also converges to the true solution for limited aperture data.\nHowever, the number of training samples required to train the neural network\nscales exponentially in frequency and the complexity of the obstacles\nconsidered. We conclude with a discussion of this phenomenon and potential\ndirections for future research.\n","authors":["Mo Zhou","Jiequn Han","Manas Rachh","Carlos Borges"],"pdf_url":"https://arxiv.org/pdf/2212.08736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15471v3","updated":"2023-07-21T17:20:16Z","published":"2023-03-25T10:21:13Z","title":"Embedding Contextual Information through Reward Shaping in Multi-Agent\n Learning: A Case Study from Google Football","summary":" Artificial Intelligence has been used to help human complete difficult tasks\nin complicated environments by providing optimized strategies for\ndecision-making or replacing the manual labour. In environments including\nmultiple agents, such as football, the most common methods to train agents are\nImitation Learning and Multi-Agent Reinforcement Learning (MARL). However, the\nagents trained by Imitation Learning cannot outperform the expert demonstrator,\nwhich makes humans hardly get new insights from the learnt policy. Besides,\nMARL is prone to the credit assignment problem. In environments with sparse\nreward signal, this method can be inefficient. The objective of our research is\nto create a novel reward shaping method by embedding contextual information in\nreward function to solve the aforementioned challenges. We demonstrate this in\nthe Google Research Football (GRF) environment. We quantify the contextual\ninformation extracted from game state observation and use this quantification\ntogether with original sparse reward to create the shaped reward. The\nexperiment results in the GRF environment prove that our reward shaping method\nis a useful addition to state-of-the-art MARL algorithms for training agents in\nenvironments with sparse reward signal.\n","authors":["Chaoyi Gu","Varuna De Silva","Corentin Artaud","Rafael Pina"],"pdf_url":"https://arxiv.org/pdf/2303.15471v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11714v1","updated":"2023-07-21T17:19:01Z","published":"2023-07-21T17:19:01Z","title":"Convergence of SGD for Training Neural Networks with Sliced Wasserstein\n Losses","summary":" Optimal Transport has sparked vivid interest in recent years, in particular\nthanks to the Wasserstein distance, which provides a geometrically sensible and\nintuitive way of comparing probability measures. For computational reasons, the\nSliced Wasserstein (SW) distance was introduced as an alternative to the\nWasserstein distance, and has seen uses for training generative Neural Networks\n(NNs). While convergence of Stochastic Gradient Descent (SGD) has been observed\npractically in such a setting, there is to our knowledge no theoretical\nguarantee for this observation. Leveraging recent works on convergence of SGD\non non-smooth and non-convex functions by Bianchi et al. 
(2022), we aim to\nbridge that knowledge gap, and provide a realistic context under which\nfixed-step SGD trajectories for the SW loss on NN parameters converge. More\nprecisely, we show that the trajectories approach the set of (sub)-gradient\nflow equations as the step decreases. Under stricter assumptions, we show a\nmuch stronger convergence result for noised and projected SGD schemes, namely\nthat the long-run limits of the trajectories approach a set of generalised\ncritical points of the loss function.\n","authors":["Eloi Tanguy"],"pdf_url":"https://arxiv.org/pdf/2307.11714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11704v1","updated":"2023-07-21T17:00:06Z","published":"2023-07-21T17:00:06Z","title":"JoinGym: An Efficient Query Optimization Environment for Reinforcement\n Learning","summary":" In this paper, we present \\textsc{JoinGym}, an efficient and lightweight\nquery optimization environment for reinforcement learning (RL). Join order\nselection (JOS) is a classic NP-hard combinatorial optimization problem from\ndatabase query optimization and can serve as a practical testbed for the\ngeneralization capabilities of RL algorithms. We describe how to formulate each\nof the left-deep and bushy variants of the JOS problem as a Markov Decision\nProcess (MDP), and we provide an implementation adhering to the standard\nGymnasium API. We highlight that our implementation \\textsc{JoinGym} is\ncompletely based on offline traces of all possible joins, which enables RL\npractitioners to easily and quickly test their methods on a realistic data\nmanagement problem without needing to setup any systems. Moreover, we also\nprovide all possible join traces on $3300$ novel SQL queries generated from the\nIMDB dataset. Upon benchmarking popular RL algorithms, we find that at least\none method can obtain near-optimal performance on train-set queries but their\nperformance degrades by several orders of magnitude on test-set queries. This\ngap motivates further research for RL algorithms that generalize well in\nmulti-task combinatorial optimization problems.\n","authors":["Kaiwen Wang","Junxiong Wang","Yueying Li","Nathan Kallus","Immanuel Trummer","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2307.11704v1.pdf","comment":"We will make all the queries available soon"},{"id":"http://arxiv.org/abs/2307.10490v2","updated":"2023-07-21T16:51:15Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. 
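[Editor's note on the attack described above: mechanically, such an injection is targeted adversarial-example optimization on the image pixels. The sketch below shows the generic projected-gradient pattern against a small stand-in model; the dummy model, perturbation budget, and target are illustrative assumptions, not the attack code or the models (LLaVa, PandaGPT) from the paper.]

```python
import torch

# Hypothetical stand-in for a multi-modal model's text head; every name here is
# illustrative, not from the paper or the attacked systems.
class DummyCaptioner(torch.nn.Module):
    def __init__(self, vocab_size=100):
        super().__init__()
        self.head = torch.nn.Linear(3 * 32 * 32, vocab_size)

    def forward(self, img):                      # img: (1, 3, 32, 32)
        return self.head(img.flatten(1))         # logits over a toy vocabulary

model = DummyCaptioner()
image = torch.rand(1, 3, 32, 32)                 # the benign image
target = torch.tensor([7])                       # attacker-chosen output "token"
delta = torch.zeros_like(image, requires_grad=True)
eps, alpha = 8 / 255, 1 / 255

for _ in range(40):                              # projected gradient descent
    loss = torch.nn.functional.cross_entropy(model(image + delta), target)
    loss.backward()
    with torch.no_grad():
        delta -= alpha * delta.grad.sign()       # step toward the target output
        delta.clamp_(-eps, eps)                  # stay within the L-inf budget
        delta.grad.zero_()

adversarial_image = (image + delta).clamp(0, 1).detach()
```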
We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11695v1","updated":"2023-07-21T16:50:10Z","published":"2023-07-21T16:50:10Z","title":"Using simulation to calibrate real data acquisition in veterinary\n medicine","summary":" This paper explores the innovative use of simulation environments to enhance\ndata acquisition and diagnostics in veterinary medicine, focusing specifically\non gait analysis in dogs. The study harnesses the power of Blender and the\nBlenderproc library to generate synthetic datasets that reflect diverse\nanatomical, environmental, and behavioral conditions. The generated data,\nrepresented in graph form and standardized for optimal analysis, is utilized to\ntrain machine learning algorithms for identifying normal and abnormal gaits.\nTwo distinct datasets with varying degrees of camera angle granularity are\ncreated to further investigate the influence of camera perspective on model\naccuracy. Preliminary results suggest that this simulation-based approach holds\npromise for advancing veterinary diagnostics by enabling more precise data\nacquisition and more effective machine learning models. By integrating\nsynthetic and real-world patient data, the study lays a robust foundation for\nimproving overall effectiveness and efficiency in veterinary medicine.\n","authors":["Krystian Strzałka","Szymon Mazurek","Maciej Wielgosz","Paweł Russek","Jakub Caputa","Daria Łukasik","Jan Krupiński","Jakub Grzeszczyk","Michał Karwatowski","Rafał Frączek","Ernest Jamro","Marcin Pietroń","Sebastian Koryciak","Agnieszka Dąbrowska-Boruch","Kazimierz Wiatr"],"pdf_url":"https://arxiv.org/pdf/2307.11695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11672v1","updated":"2023-07-21T16:18:58Z","published":"2023-07-21T16:18:58Z","title":"Fast Adaptive Test-Time Defense with Robust Features","summary":" Adaptive test-time defenses are used to improve the robustness of deep neural\nnetworks to adversarial examples. However, existing methods significantly\nincrease the inference time due to additional optimization on the model\nparameters or the input at test time. In this work, we propose a novel adaptive\ntest-time defense strategy that is easy to integrate with any existing (robust)\ntraining procedure without additional test-time computation. Based on the\nnotion of robustness of features that we present, the key idea is to project\nthe trained models to the most robust feature space, thereby reducing the\nvulnerability to adversarial attacks in non-robust directions. We theoretically\nshow that the top eigenspace of the feature matrix are more robust for a\ngeneralized additive model and support our argument for a large width neural\nnetwork with the Neural Tangent Kernel (NTK) equivalence. 
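[Editor's note: a loose, generic illustration of "projecting onto the top eigenspace of the feature matrix" is given below using the empirical feature covariance. The feature dimensionality, the number of retained directions, and the random features are placeholders; the authors' exact procedure may differ.]

```python
import numpy as np

rng = np.random.default_rng(0)
features = rng.standard_normal((1000, 64))       # penultimate-layer features (toy)

mean = features.mean(axis=0)
centered = features - mean
cov = centered.T @ centered / len(features)
eigvals, eigvecs = np.linalg.eigh(cov)           # eigenvalues in ascending order
top = eigvecs[:, -16:]                           # top-16 eigenspace, assumed "robust"

robust_features = centered @ top @ top.T + mean  # project, then restore the mean
print(robust_features.shape)
```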
We conduct extensive\nexperiments on CIFAR-10 and CIFAR-100 datasets for several robustness\nbenchmarks, including the state-of-the-art methods in RobustBench, and observe\nthat the proposed method outperforms existing adaptive test-time defenses at\nmuch lower computation costs.\n","authors":["Anurag Singh","Mahalakshmi Sabanayagam","Krikamol Muandet","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2307.11672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17282v2","updated":"2023-07-21T16:15:21Z","published":"2023-05-26T22:01:47Z","title":"Universal consistency of the $k$-NN rule in metric spaces and Nagata\n dimension. II","summary":" We continue to investigate the $k$ nearest neighbour learning rule in\nseparable metric spaces. Thanks to the results of C\\'erou and Guyader (2006)\nand Preiss (1983), this rule is known to be universally consistent in every\nmetric space $X$ that is sigma-finite dimensional in the sense of Nagata. Here\nwe show that the rule is strongly universally consistent in such spaces in the\nabsence of ties. Under the tie-breaking strategy applied by Devroye,\nGy\\\"{o}rfi, Krzy\\.{z}ak, and Lugosi (1994) in the Euclidean setting, we manage\nto show the strong universal consistency in non-Archimedian metric spaces (that\nis, those of Nagata dimension zero). Combining the theorem of C\\'erou and\nGuyader with results of Assouad and Quentin de Gromard (2006), one deduces that\nthe $k$-NN rule is universally consistent in metric spaces having finite\ndimension in the sense of de Groot. In particular, the $k$-NN rule is\nuniversally consistent in the Heisenberg group which is not sigma-finite\ndimensional in the sense of Nagata as follows from an example independently\nconstructed by Kor\\'anyi and Reimann (1995) and Sawyer and Wheeden (1992).\n","authors":["Sushma Kumari","Vladimir G. Pestov"],"pdf_url":"https://arxiv.org/pdf/2305.17282v2.pdf","comment":"Latex 2e, 17 pages. The Heisenberg group is now presented in more\n detail, with some proofs and more references added, and a discussion of open\n problems added at the end"},{"id":"http://arxiv.org/abs/2307.11668v1","updated":"2023-07-21T16:12:46Z","published":"2023-07-21T16:12:46Z","title":"An Efficient Interior-Point Method for Online Convex Optimization","summary":" A new algorithm for regret minimization in online convex optimization is\ndescribed. The regret of the algorithm after $T$ time periods is $O(\\sqrt{T\n\\log T})$ - which is the minimum possible up to a logarithmic term. In\naddition, the new algorithm is adaptive, in the sense that the regret bounds\nhold not only for the time periods $1,\\ldots,T$ but also for every sub-interval\n$s,s+1,\\ldots,t$. 
The running time of the algorithm matches that of newly\nintroduced interior point algorithms for regret minimization: in\n$n$-dimensional space, during each iteration the new algorithm essentially\nsolves a system of linear equations of order $n$, rather than solving some\nconstrained convex optimization problem in $n$ dimensions and possibly many\nconstraints.\n","authors":["Elad Hazan","Nimrod Megiddo"],"pdf_url":"https://arxiv.org/pdf/2307.11668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.14778v2","updated":"2023-07-21T16:05:34Z","published":"2021-09-30T00:58:27Z","title":"CALDA: Improving Multi-Source Time Series Domain Adaptation with\n Contrastive Adversarial Learning","summary":" Unsupervised domain adaptation (UDA) provides a strategy for improving\nmachine learning performance in data-rich (target) domains where ground truth\nlabels are inaccessible but can be found in related (source) domains. In cases\nwhere meta-domain information such as label distributions is available, weak\nsupervision can further boost performance. We propose a novel framework, CALDA,\nto tackle these two problems. CALDA synergistically combines the principles of\ncontrastive learning and adversarial learning to robustly support multi-source\nUDA (MS-UDA) for time series data. Similar to prior methods, CALDA utilizes\nadversarial learning to align source and target feature representations. Unlike\nprior approaches, CALDA additionally leverages cross-source label information\nacross domains. CALDA pulls examples with the same label close to each other,\nwhile pushing apart examples with different labels, reshaping the space through\ncontrastive learning. Unlike prior contrastive adaptation methods, CALDA\nrequires neither data augmentation nor pseudo labeling, which may be more\nchallenging for time series. We empirically validate our proposed approach.\nBased on results from human activity recognition, electromyography, and\nsynthetic datasets, we find utilizing cross-source information improves\nperformance over prior time series and contrastive methods. Weak supervision\nfurther improves performance, even in the presence of noise, allowing CALDA to\noffer generalizable strategies for MS-UDA. Code is available at:\nhttps://github.com/floft/calda\n","authors":["Garrett Wilson","Janardhan Rao Doppa","Diane J. Cook"],"pdf_url":"https://arxiv.org/pdf/2109.14778v2.pdf","comment":"Accepted at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2307.11661v1","updated":"2023-07-21T15:49:59Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. 
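[Editor's note: as a concrete, hedged illustration of swapping CLIP's default prompt for visually descriptive text, the snippet below runs zero-shot classification with Hugging Face's CLIP wrappers. The example prompts are hand-written stand-ins for LLM-generated descriptions, and `example.jpg` is a placeholder image path.]

```python
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Visually descriptive prompts (stand-ins for GPT-4-generated descriptions),
# instead of the default "a photo of a {class}" template.
prompts = [
    "a satellite photo of a forest: dense, dark-green tree crowns seen from above",
    "a satellite photo of a highway: long straight asphalt lanes with small vehicles",
]
image = Image.open("example.jpg")                 # placeholder image path

inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)
probs = model(**inputs).logits_per_image.softmax(dim=-1)
print(probs)
```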
We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. We will release the code, prompts, and auxiliary text\ndataset upon acceptance.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v1.pdf","comment":"10 pages, Pre-print"},{"id":"http://arxiv.org/abs/2307.11655v1","updated":"2023-07-21T15:43:32Z","published":"2023-07-21T15:43:32Z","title":"Bandits with Deterministically Evolving States","summary":" We propose a model for learning with bandit feedback while accounting for\ndeterministically evolving and unobservable states that we call Bandits with\nDeterministically Evolving States. The workhorse applications of our model are\nlearning for recommendation systems and learning for online ads. In both cases,\nthe reward that the algorithm obtains at each round is a function of the\nshort-term reward of the action chosen and how ``healthy'' the system is (i.e.,\nas measured by its state). For example, in recommendation systems, the reward\nthat the platform obtains from a user's engagement with a particular type of\ncontent depends not only on the inherent features of the specific content, but\nalso on how the user's preferences have evolved as a result of interacting with\nother types of content on the platform. Our general model accounts for the\ndifferent rate $\\lambda \\in [0,1]$ at which the state evolves (e.g., how fast a\nuser's preferences shift as a result of previous content consumption) and\nencompasses standard multi-armed bandits as a special case. The goal of the\nalgorithm is to minimize a notion of regret against the best-fixed sequence of\narms pulled. We analyze online learning algorithms for any possible\nparametrization of the evolution rate $\\lambda$. Specifically, the regret rates\nobtained are: for $\\lambda \\in [0, 1/T^2]$: $\\widetilde O(\\sqrt{KT})$; for\n$\\lambda = T^{-a/b}$ with $b < a < 2b$: $\\widetilde O (T^{b/a})$; for $\\lambda\n\\in (1/T, 1 - 1/\\sqrt{T}): \\widetilde O (K^{1/3}T^{2/3})$; and for $\\lambda \\in\n[1 - 1/\\sqrt{T}, 1]: \\widetilde O (K\\sqrt{T})$.\n","authors":["Khashayar Khosravi","Renato Paes Leme","Chara Podimata","Apostolis Tsorvantzis"],"pdf_url":"https://arxiv.org/pdf/2307.11655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09208v3","updated":"2023-07-21T15:27:34Z","published":"2022-05-18T20:34:25Z","title":"Torchhd: An Open Source Python Library to Support Research on\n Hyperdimensional Computing and Vector Symbolic Architectures","summary":" Hyperdimensional computing (HD), also known as vector symbolic architectures\n(VSA), is a framework for computing with distributed representations by\nexploiting properties of random high-dimensional vector spaces. The commitment\nof the scientific community to aggregate and disseminate research in this\nparticularly multidisciplinary area has been fundamental for its advancement.\nJoining these efforts, we present Torchhd, a high-performance open source\nPython library for HD/VSA. 
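[Editor's note for readers new to HD/VSA: the core operations (random high-dimensional codes, binding by elementwise multiplication, bundling by addition) can be sketched in a few lines of plain NumPy. This is a generic illustration of the computing model, not Torchhd's API.]

```python
import numpy as np

rng = np.random.default_rng(0)
D = 10_000                                     # hypervector dimensionality

def random_hv():
    return rng.choice([-1, 1], size=D)         # random bipolar hypervector

country, capital = random_hv(), random_hv()    # role vectors
france, paris = random_hv(), random_hv()       # filler vectors

# Bind roles to fillers (elementwise *), then bundle the pairs (+, sign).
record = np.sign(country * france + capital * paris)

# Unbinding with the `capital` role recovers something close to `paris`.
unbound = record * capital
print("similarity to paris :", unbound @ paris / D)
print("similarity to france:", unbound @ france / D)
```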
Torchhd seeks to make HD/VSA more accessible and\nserves as an efficient foundation for further research and application\ndevelopment. The easy-to-use library builds on top of PyTorch and features\nstate-of-the-art HD/VSA functionality, clear documentation, and implementation\nexamples from well-known publications. Comparing publicly available code with\ntheir corresponding Torchhd implementation shows that experiments can run up to\n100x faster. Torchhd is available at:\nhttps://github.com/hyperdimensional-computing/torchhd.\n","authors":["Mike Heddes","Igor Nunes","Pere Vergés","Denis Kleyko","Danny Abraham","Tony Givargis","Alexandru Nicolau","Alexander Veidenbaum"],"pdf_url":"https://arxiv.org/pdf/2205.09208v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08647v4","updated":"2023-07-21T14:59:16Z","published":"2023-02-17T01:32:44Z","title":"Multiresolution Graph Transformers and Wavelet Positional Encoding for\n Learning Hierarchical Structures","summary":" Contemporary graph learning algorithms are not well-defined for large\nmolecules since they do not consider the hierarchical interactions among the\natoms, which are essential to determine the molecular properties of\nmacromolecules. In this work, we propose Multiresolution Graph Transformers\n(MGT), the first graph transformer architecture that can learn to represent\nlarge molecules at multiple scales. MGT can learn to produce representations\nfor the atoms and group them into meaningful functional groups or repeating\nunits. We also introduce Wavelet Positional Encoding (WavePE), a new positional\nencoding method that can guarantee localization in both spectral and spatial\ndomains. Our proposed model achieves competitive results on two macromolecule\ndatasets consisting of polymers and peptides, and one drug-like molecule\ndataset. Importantly, our model outperforms other state-of-the-art methods and\nachieves chemical accuracy in estimating molecular properties (e.g., GAP, HOMO\nand LUMO) calculated by Density Functional Theory (DFT) in the polymers\ndataset. Furthermore, the visualizations, including clustering results on\nmacromolecules and low-dimensional spaces of their representations, demonstrate\nthe capability of our methodology in learning to represent long-range and\nhierarchical structures. Our PyTorch implementation is publicly available at\nhttps://github.com/HySonLab/Multires-Graph-Transformer\n","authors":["Nhat Khang Ngo","Truong Son Hy","Risi Kondor"],"pdf_url":"https://arxiv.org/pdf/2302.08647v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11629v1","updated":"2023-07-21T14:53:12Z","published":"2023-07-21T14:53:12Z","title":"Scalable Multi-agent Skill Discovery based on Kronecker Graphs","summary":" Covering skill (a.k.a., option) discovery has been developed to improve the\nexploration of RL in single-agent scenarios with sparse reward signals, through\nconnecting the most distant states in the embedding space provided by the\nFiedler vector of the state transition graph. Given that joint state space\ngrows exponentially with the number of agents in multi-agent systems, existing\nresearches still relying on single-agent option discovery either become\nprohibitive or fail to directly discover joint options that improve the\nconnectivity of the joint state space. In this paper, we show how to directly\ncompute multi-agent options with collaborative exploratory behaviors while\nstill enjoying the ease of decomposition. 
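[Editor's note grounding the phrase "connecting the most distant states in the embedding space provided by the Fiedler vector": below is a tiny single-agent example on a toy chain-shaped transition graph. The multi-agent Kronecker-based estimation of the paper is not reproduced here.]

```python
import numpy as np

# Toy state-transition graph: 6 states on a chain, undirected adjacency.
A = np.zeros((6, 6))
for i in range(5):
    A[i, i + 1] = A[i + 1, i] = 1.0

L = np.diag(A.sum(axis=1)) - A               # graph Laplacian
eigvals, eigvecs = np.linalg.eigh(L)
fiedler = eigvecs[:, 1]                      # eigenvector of the 2nd-smallest eigenvalue

# A covering option connects the two "most distant" states in this embedding.
start, goal = int(np.argmin(fiedler)), int(np.argmax(fiedler))
print(start, goal)                           # the two chain endpoints, 0 and 5
```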
Our key idea is to approximate the\njoint state space as a Kronecker graph, based on which we can directly estimate\nits Fiedler vector using the Laplacian spectrum of individual agents'\ntransition graphs. Further, considering that directly computing the Laplacian\nspectrum is intractable for tasks with infinite-scale state spaces, we further\npropose a deep learning extension of our method by estimating eigenfunctions\nthrough NN-based representation learning techniques. The evaluation on\nmulti-agent tasks built with simulators like Mujoco, shows that the proposed\nalgorithm can successfully identify multi-agent options, and significantly\noutperforms the state-of-the-art. Codes are available at:\nhttps://github.itap.purdue.edu/Clan-labs/Scalable_MAOD_via_KP.\n","authors":["Jiayu Chen","Jingdi Chen","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2307.11629v1.pdf","comment":"Accepted to NeurIPS 2022. arXiv admin note: substantial text overlap\n with arXiv:2201.08227"},{"id":"http://arxiv.org/abs/2307.11620v1","updated":"2023-07-21T14:37:54Z","published":"2023-07-21T14:37:54Z","title":"Offline Multi-Agent Reinforcement Learning with Implicit Global-to-Local\n Value Regularization","summary":" Offline reinforcement learning (RL) has received considerable attention in\nrecent years due to its attractive capability of learning policies from offline\ndatasets without environmental interactions. Despite some success in the\nsingle-agent setting, offline multi-agent RL (MARL) remains to be a challenge.\nThe large joint state-action space and the coupled multi-agent behaviors pose\nextra complexities for offline policy optimization. Most existing offline MARL\nstudies simply apply offline data-related regularizations on individual agents,\nwithout fully considering the multi-agent system at the global level. In this\nwork, we present OMIGA, a new offline m ulti-agent RL algorithm with implicit\nglobal-to-local v alue regularization. OMIGA provides a principled framework to\nconvert global-level value regularization into equivalent implicit local value\nregularizations and simultaneously enables in-sample learning, thus elegantly\nbridging multi-agent value decomposition and policy learning with offline\nregularizations. Based on comprehensive experiments on the offline multi-agent\nMuJoCo and StarCraft II micro-management tasks, we show that OMIGA achieves\nsuperior performance over the state-of-the-art offline MARL methods in almost\nall tasks.\n","authors":["Xiangsen Wang","Haoran Xu","Yinan Zheng","Xianyuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2307.11620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11617v1","updated":"2023-07-21T14:36:40Z","published":"2023-07-21T14:36:40Z","title":"Robust Fully-Asynchronous Methods for Distributed Training over General\n Architecture","summary":" Perfect synchronization in distributed machine learning problems is\ninefficient and even impossible due to the existence of latency, package losses\nand stragglers. We propose a Robust Fully-Asynchronous Stochastic Gradient\nTracking method (R-FAST), where each device performs local computation and\ncommunication at its own pace without any form of synchronization. 
Different\nfrom existing asynchronous distributed algorithms, R-FAST can eliminate the\nimpact of data heterogeneity across devices and allow for packet losses by\nemploying a robust gradient tracking strategy that relies on properly designed\nauxiliary variables for tracking and buffering the overall gradient vector.\nMore importantly, the proposed method utilizes two spanning-tree graphs for\ncommunication so long as both share at least one common root, enabling flexible\ndesigns in communication architectures. We show that R-FAST converges in\nexpectation to a neighborhood of the optimum with a geometric rate for smooth\nand strongly convex objectives; and to a stationary point with a sublinear rate\nfor general non-convex settings. Extensive experiments demonstrate that R-FAST\nruns 1.5-2 times faster than synchronous benchmark algorithms, such as\nRing-AllReduce and D-PSGD, while still achieving comparable accuracy, and\noutperforms existing asynchronous SOTA algorithms, such as AD-PSGD and OSGP,\nespecially in the presence of stragglers.\n","authors":["Zehan Zhu","Ye Tian","Yan Huang","Jinming Xu","Shibo He"],"pdf_url":"https://arxiv.org/pdf/2307.11617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11609v1","updated":"2023-07-21T14:25:22Z","published":"2023-07-21T14:25:22Z","title":"Persistent Ballistic Entanglement Spreading with Optimal Control in\n Quantum Spin Chains","summary":" Entanglement propagation provides a key routine to understand quantum\nmany-body dynamics in and out of equilibrium. In this work, we uncover that the\n``variational entanglement-enhancing'' field (VEEF) robustly induces a\npersistent ballistic spreading of entanglement in quantum spin chains. The VEEF\nis time dependent, and is optimally controlled to maximize the bipartite\nentanglement entropy (EE) of the final state. Such a linear growth persists\ntill the EE reaches the genuine saturation $\\tilde{S} = - \\log_{2}\n2^{-\\frac{N}{2}}=\\frac{N}{2}$ with $N$ the total number of spins. The EE\nsatisfies $S(t) = v t$ for the time $t \\leq \\frac{N}{2v}$, with $v$ the\nvelocity. These results are in sharp contrast with the behaviors without VEEF,\nwhere the EE generally approaches a sub-saturation known as the Page value\n$\\tilde{S}_{P} =\\tilde{S} - \\frac{1}{2\\ln{2}}$ in the long-time limit, and the\nentanglement growth deviates from being linear before the Page value is\nreached. The dependence between the velocity and interactions is explored, with\n$v \\simeq 2.76$, $4.98$, and $5.75$ for the spin chains with Ising, XY, and\nHeisenberg interactions, respectively. We further show that the nonlinear\ngrowth of EE emerges with the presence of long-range interactions.\n","authors":["Ying Lu","Pei Shi","Xiao-Han Wang","Jie Hu","Shi-Ju Ran"],"pdf_url":"https://arxiv.org/pdf/2307.11609v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.11608v1","updated":"2023-07-21T14:25:06Z","published":"2023-07-21T14:25:06Z","title":"Learning minimal representations of stochastic processes with\n variational autoencoders","summary":" Stochastic processes have found numerous applications in science, as they are\nbroadly used to model a variety of natural phenomena. Due to their intrinsic\nrandomness and uncertainty, they are however difficult to characterize. Here,\nwe introduce an unsupervised machine learning approach to determine the minimal\nset of parameters required to effectively describe the dynamics of a stochastic\nprocess. 
Our method builds upon an extended $\\beta$-variational autoencoder\narchitecture. By means of simulated datasets corresponding to paradigmatic\ndiffusion models, we showcase its effectiveness in extracting the minimal\nrelevant parameters that accurately describe these dynamics. Furthermore, the\nmethod enables the generation of new trajectories that faithfully replicate the\nexpected stochastic behavior. Overall, our approach enables for the autonomous\ndiscovery of unknown parameters describing stochastic processes, hence\nenhancing our comprehension of complex phenomena across various fields.\n","authors":["Gabriel Fernández-Fernández","Carlo Manzo","Maciej Lewenstein","Alexandre Dauphin","Gorka Muñoz-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.11608v1.pdf","comment":"9 pages, 5 figures, 1 table. Code available at\n https://github.com/GabrielFernandezFernandez/SPIVAE"},{"id":"http://arxiv.org/abs/2307.11607v1","updated":"2023-07-21T14:23:41Z","published":"2023-07-21T14:23:41Z","title":"Finding Optimal Diverse Feature Sets with Alternative Feature Selection","summary":" Feature selection is popular for obtaining small, interpretable, yet highly\naccurate prediction models. Conventional feature-selection methods typically\nyield one feature set only, which might not suffice in some scenarios. For\nexample, users might be interested in finding alternative feature sets with\nsimilar prediction quality, offering different explanations of the data. In\nthis article, we introduce alternative feature selection and formalize it as an\noptimization problem. In particular, we define alternatives via constraints and\nenable users to control the number and dissimilarity of alternatives. Next, we\nanalyze the complexity of this optimization problem and show NP-hardness.\nFurther, we discuss how to integrate conventional feature-selection methods as\nobjectives. Finally, we evaluate alternative feature selection with 30\nclassification datasets. We observe that alternative feature sets may indeed\nhave high prediction quality, and we analyze several factors influencing this\noutcome.\n","authors":["Jakob Bach"],"pdf_url":"https://arxiv.org/pdf/2307.11607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00211v2","updated":"2023-07-21T13:57:09Z","published":"2022-12-01T01:40:03Z","title":"A Unified Algorithm Framework for Unsupervised Discovery of Skills based\n on Determinantal Point Process","summary":" Learning rich skills through temporal abstractions without supervision of\nexternal rewards is at the frontier of Reinforcement Learning research.\nExisting works mainly fall into two distinctive categories: variational and\nLaplacian-based skill (a.k.a., option) discovery. The former maximizes the\ndiversity of the discovered options through a mutual information loss but\noverlooks coverage of the state space, while the latter focuses on improving\nthe coverage of options by increasing connectivity during exploration, but does\nnot consider diversity. In this paper, we propose a unified framework that\nquantifies diversity and coverage through a novel use of the Determinantal\nPoint Process (DPP) and enables unsupervised option discovery explicitly\noptimizing both objectives. Specifically, we define the DPP kernel matrix with\nthe Laplacian spectrum of the state transition graph and use the expected mode\nnumber in the trajectories as the objective to capture and enhance both\ndiversity and coverage of the learned options. 
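[Editor's rough numerical aside: for an L-ensemble DPP, the expected number of selected items has the closed form E|Y| = sum_i lambda_i / (1 + lambda_i) over the kernel's eigenvalues, which is one simple way a Laplacian-spectrum kernel can yield a diversity/coverage score. Whether this matches the paper's "expected mode number" objective exactly is not claimed here; the toy graph is an assumption.]

```python
import numpy as np

# Toy state-transition graph (a 5-cycle) and its Laplacian spectrum.
A = np.zeros((5, 5))
for i in range(5):
    A[i, (i + 1) % 5] = A[(i + 1) % 5, i] = 1.0
lam = np.linalg.eigvalsh(np.diag(A.sum(axis=1)) - A)

# L-ensemble DPP with the Laplacian as kernel: expected set size in closed form.
expected_size = float(np.sum(lam / (1.0 + lam)))
print(round(expected_size, 3))
```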
The proposed option discovery\nalgorithm is extensively evaluated using challenging tasks built with Mujoco\nand Atari, demonstrating that our proposed algorithm substantially outperforms\nSOTA baselines from both diversity- and coverage-driven categories. The codes\nare available at https://github.com/LucasCJYSDL/ODPP.\n","authors":["Jiayu Chen","Vaneet Aggarwal","Tian Lan"],"pdf_url":"https://arxiv.org/pdf/2212.00211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11588v1","updated":"2023-07-21T13:51:45Z","published":"2023-07-21T13:51:45Z","title":"Transferability of Convolutional Neural Networks in Stationary Learning\n Tasks","summary":" Recent advances in hardware and big data acquisition have accelerated the\ndevelopment of deep learning techniques. For an extended period of time,\nincreasing the model complexity has led to performance improvements for various\ntasks. However, this trend is becoming unsustainable and there is a need for\nalternative, computationally lighter methods. In this paper, we introduce a\nnovel framework for efficient training of convolutional neural networks (CNNs)\nfor large-scale spatial problems. To accomplish this we investigate the\nproperties of CNNs for tasks where the underlying signals are stationary. We\nshow that a CNN trained on small windows of such signals achieves a nearly\nperformance on much larger windows without retraining. This claim is supported\nby our theoretical analysis, which provides a bound on the performance\ndegradation. Additionally, we conduct thorough experimental analysis on two\ntasks: multi-target tracking and mobile infrastructure on demand. Our results\nshow that the CNN is able to tackle problems with many hundreds of agents after\nbeing trained with fewer than ten. Thus, CNN architectures provide solutions to\nthese problems at previously computationally intractable scales.\n","authors":["Damian Owerko","Charilaos I. Kanatsoulis","Jennifer Bondarchuk","Donald J. Bucci Jr","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2307.11588v1.pdf","comment":"14 pages, 7 figures, for associated code see\n https://github.com/damowerko/mtt"},{"id":"http://arxiv.org/abs/2307.11584v1","updated":"2023-07-21T13:48:11Z","published":"2023-07-21T13:48:11Z","title":"A Change of Heart: Improving Speech Emotion Recognition through\n Speech-to-Text Modality Conversion","summary":" Speech Emotion Recognition (SER) is a challenging task. In this paper, we\nintroduce a modality conversion concept aimed at enhancing emotion recognition\nperformance on the MELD dataset. We assess our approach through two\nexperiments: first, a method named Modality-Conversion that employs automatic\nspeech recognition (ASR) systems, followed by a text classifier; second, we\nassume perfect ASR output and investigate the impact of modality conversion on\nSER, this method is called Modality-Conversion++. Our findings indicate that\nthe first method yields substantial results, while the second method\noutperforms state-of-the-art (SOTA) speech-based approaches in terms of SER\nweighted-F1 (WF1) score on the MELD dataset. 
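[Editor's note: a minimal sketch of the "convert speech to text, then classify" stage with a weighted-F1 evaluation is shown below. The toy transcripts and labels are placeholders for ASR output and MELD emotion labels, and the classifier is not the one used in the paper.]

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

# Placeholder transcripts (as if produced by an ASR system) and emotion labels.
train_texts = ["i cannot believe you did this", "what wonderful news",
               "leave me alone", "okay fine whatever"]
train_labels = ["anger", "joy", "anger", "neutral"]
test_texts = ["this news is wonderful", "just leave me alone please"]
test_labels = ["joy", "anger"]

clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)),
                    LogisticRegression(max_iter=1000))
clf.fit(train_texts, train_labels)
pred = clf.predict(test_texts)
print("weighted F1:", f1_score(test_labels, pred, average="weighted"))
```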
This research highlights the\npotential of modality conversion for tasks that can be conducted in alternative\nmodalities.\n","authors":["Zeinab Sadat Taghavi","Ali Satvaty","Hossein Sameti"],"pdf_url":"https://arxiv.org/pdf/2307.11584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.08227v3","updated":"2023-07-21T13:42:59Z","published":"2022-01-20T15:33:08Z","title":"Learning Multi-agent Skills for Tabular Reinforcement Learning using\n Factor Graphs","summary":" Covering skill (a.k.a., option) discovery has been developed to improve the\nexploration of reinforcement learning in single-agent scenarios with sparse\nreward signals, through connecting the most distant states in the embedding\nspace provided by the Fiedler vector of the state transition graph. However,\nthese option discovery methods cannot be directly extended to multi-agent\nscenarios, since the joint state space grows exponentially with the number of\nagents in the system. Thus, existing research on adopting options in\nmulti-agent scenarios still relies on single-agent option discovery and fails to\ndirectly discover the joint options that can improve the connectivity of the\njoint state space of agents. In this paper, we show that it is indeed possible\nto directly compute multi-agent options with collaborative exploratory\nbehaviors among the agents, while still enjoying the ease of decomposition. Our\nkey idea is to approximate the joint state space as a Kronecker graph -- the\nKronecker product of individual agents' state transition graphs, based on which\nwe can directly estimate the Fiedler vector of the joint state space using the\nLaplacian spectrum of individual agents' transition graphs. This decomposition\nenables us to efficiently construct multi-agent joint options by encouraging\nagents to connect the sub-goal joint states which correspond to the\nminimum or maximum values of the estimated joint Fiedler vector. The evaluation\nbased on multi-agent collaborative tasks shows that the proposed algorithm can\nsuccessfully identify multi-agent options, and significantly outperforms prior\nworks using single-agent options or no options, in terms of both faster\nexploration and higher cumulative rewards.\n","authors":["Jiayu Chen","Jingdi Chen","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2201.08227v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03269v2","updated":"2023-07-21T13:27:13Z","published":"2022-10-07T00:40:59Z","title":"Multi-agent Deep Covering Skill Discovery","summary":" The use of skills (a.k.a., options) can greatly accelerate exploration in\nreinforcement learning, especially when only sparse reward signals are\navailable. While option discovery methods have been proposed for individual\nagents, in multi-agent reinforcement learning settings, discovering\ncollaborative options that can coordinate the behavior of multiple agents and\nencourage them to visit the under-explored regions of their joint state space\nhas not been considered. In this case, we propose Multi-agent Deep Covering\nOption Discovery, which constructs the multi-agent options through minimizing\nthe expected cover time of the multiple agents' joint state space. Also, we\npropose a novel framework to adopt the multi-agent options in the MARL process.\nIn practice, a multi-agent task can usually be divided into some sub-tasks,\neach of which can be completed by a sub-group of the agents. 
Therefore, our\nalgorithm framework first leverages an attention mechanism to find\ncollaborative agent sub-groups that would benefit most from coordinated\nactions. Then, a hierarchical algorithm, namely HA-MSAC, is developed to learn\nthe multi-agent options for each sub-group to complete their sub-tasks first,\nand then to integrate them through a high-level policy as the solution of the\nwhole task. This hierarchical option construction allows our framework to\nstrike a balance between scalability and effective collaboration among the\nagents. The evaluation based on multi-agent collaborative tasks shows that the\nproposed algorithm can effectively capture the agent interactions with the\nattention mechanism, successfully identify multi-agent options, and\nsignificantly outperforms prior works using single-agent options or no options,\nin terms of both faster exploration and higher task rewards.\n","authors":["Jiayu Chen","Marina Haliem","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2210.03269v2.pdf","comment":"This paper was presented in part at the ICML Reinforcement Learning\n for Real Life Workshop, July 2021"},{"id":"http://arxiv.org/abs/2305.18453v2","updated":"2023-07-21T13:26:21Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Medical Image Synthesis","summary":" The demand for artificial intelligence (AI) in healthcare is rapidly\nincreasing. However, significant challenges arise from data scarcity and\nprivacy concerns, particularly in medical imaging. While existing generative\nmodels have achieved success in image synthesis and image-to-image translation\ntasks, there remains a gap in the generation of 3D semantic medical images. To\naddress this gap, we introduce Med-DDPM, a diffusion model specifically\ndesigned for semantic 3D medical image synthesis, effectively tackling data\nscarcity and privacy issues. The novelty of Med-DDPM lies in its incorporation\nof semantic conditioning, enabling precise control during the image generation\nprocess. Our model outperforms Generative Adversarial Networks (GANs) in terms\nof stability and performance, generating diverse and anatomically coherent\nimages with high visual fidelity. Comparative analysis against state-of-the-art\naugmentation techniques demonstrates that Med-DDPM produces comparable results,\nhighlighting its potential as a data augmentation tool for enhancing model\naccuracy. In conclusion, Med-DDPM pioneers 3D semantic medical image synthesis\nby delivering high-quality and anatomically coherent images. Furthermore, the\nintegration of semantic conditioning with Med-DDPM holds promise for image\nanonymization in the field of biomedical imaging, showcasing the capabilities\nof the model in addressing challenges related to data scarcity and privacy\nconcerns.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11565v1","updated":"2023-07-21T13:17:22Z","published":"2023-07-21T13:17:22Z","title":"FMT: Removing Backdoor Feature Maps via Feature Map Testing in Deep\n Neural Networks","summary":" Deep neural networks have been widely used in many critical applications,\nsuch as autonomous vehicles and medical diagnosis. However, their security is\nthreatened by backdoor attack, which is achieved by adding artificial patterns\nto specific training data. 
Existing defense strategies primarily focus on using\nreverse engineering to reproduce the backdoor trigger generated by attackers\nand subsequently repair the DNN model by adding the trigger into inputs and\nfine-tuning the model with ground-truth labels. However, once the trigger\ngenerated by the attackers is complex and invisible, the defender can not\nsuccessfully reproduce the trigger. Consequently, the DNN model will not be\nrepaired since the trigger is not effectively removed.\n In this work, we propose Feature Map Testing~(FMT). Different from existing\ndefense strategies, which focus on reproducing backdoor triggers, FMT tries to\ndetect the backdoor feature maps, which are trained to extract backdoor\ninformation from the inputs. After detecting these backdoor feature maps, FMT\nwill erase them and then fine-tune the model with a secure subset of training\ndata. Our experiments demonstrate that, compared to existing defense\nstrategies, FMT can effectively reduce the Attack Success Rate (ASR) even\nagainst the most complex and invisible attack triggers. Second, unlike\nconventional defense methods that tend to exhibit low Robust Accuracy (i.e.,\nthe model's accuracy on the poisoned data), FMT achieves higher RA, indicating\nits superiority in maintaining model performance while mitigating the effects\nof backdoor attacks~(e.g., FMT obtains 87.40\\% RA in CIFAR10). Third, compared\nto existing feature map pruning techniques, FMT can cover more backdoor feature\nmaps~(e.g., FMT removes 83.33\\% of backdoor feature maps from the model in the\nCIFAR10 \\& BadNet scenario).\n","authors":["Dong Huang","Qingwen Bu","Yahao Qing","Yichao Fu","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2307.11565v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2301.09559v2","updated":"2023-07-21T13:13:01Z","published":"2023-01-23T17:20:25Z","title":"SpArX: Sparse Argumentative Explanations for Neural Networks","summary":" Neural networks (NNs) have various applications in AI, but explaining their\ndecisions remains challenging. Existing approaches often focus on explaining\nhow changing individual inputs affects NNs' outputs. However, an explanation\nthat is consistent with the input-output behaviour of an NN is not necessarily\nfaithful to the actual mechanics thereof. In this paper, we exploit\nrelationships between multi-layer perceptrons (MLPs) and quantitative\nargumentation frameworks (QAFs) to create argumentative explanations for the\nmechanics of MLPs. Our SpArX method first sparsifies the MLP while maintaining\nas much of the original structure as possible. It then translates the sparse\nMLP into an equivalent QAF to shed light on the underlying decision process of\nthe MLP, producing global and/or local explanations. 
We demonstrate\nexperimentally that SpArX can give more faithful explanations than existing\napproaches, while simultaneously providing deeper insights into the actual\nreasoning process of MLPs.\n","authors":["Hamed Ayoobi","Nico Potyka","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2301.09559v2.pdf","comment":"Accepted at the European Conference on Artificial Intelligence (ECAI)\n 2023 Conference"},{"id":"http://arxiv.org/abs/2307.11552v1","updated":"2023-07-21T12:58:03Z","published":"2023-07-21T12:58:03Z","title":"A multi-modal representation of El Niño Southern Oscillation Diversity","summary":" The El Ni\\~no-Southern Oscillation (ENSO) is characterized by alternating\nperiods of warm (El Ni\\~no) and cold (La Ni\\~na) sea surface temperature\nanomalies (SSTA) in the equatorial Pacific. Although El Ni\\~no and La Ni\\~na\nare well-defined climate patterns, no two events are alike. To date, ENSO\ndiversity has been described primarily in terms of the longitudinal location of\npeak SSTA, used to define a bimodal classification of events in Eastern Pacific\n(EP) and Central Pacific (CP) types. Here, we use low-dimensional\nrepresentations of Pacific SSTAs to argue that binary categorical memberships\nare unsuitable to describe ENSO events. Using fuzzy unsupervised clustering, we\nrecover the four known ENSO categories, along with a fifth category: an Extreme\nEl Ni\\~no. We show that Extreme El Ni\\~nos differ both in their intensity and\ntemporal evolution from canonical EP El Ni\\~nos. We also find that CP La\nNi\\~nas, EP El Ni\\~nos, and Extreme El Ni\\~nos contribute the most to\ninterdecadal ENSO variability.\n","authors":["Jakob Schlör","Felix Strnad","Antonietta Capotondi","Bedartha Goswami"],"pdf_url":"https://arxiv.org/pdf/2307.11552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11546v1","updated":"2023-07-21T12:47:28Z","published":"2023-07-21T12:47:28Z","title":"Towards practical reinforcement learning for tokamak magnetic control","summary":" Reinforcement learning (RL) has shown promising results for real-time control\nsystems, including the domain of plasma magnetic control. However, there are\nstill significant drawbacks compared to traditional feedback control approaches\nfor magnetic confinement. In this work, we address key drawbacks of the RL\nmethod; achieving higher control accuracy for desired plasma properties,\nreducing the steady-state error, and decreasing the required time to learn new\ntasks. We build on top of \\cite{degrave2022magnetic}, and present algorithmic\nimprovements to the agent architecture and training procedure. We present\nsimulation results that show up to 65\\% improvement in shape accuracy, achieve\nsubstantial reduction in the long-term bias of the plasma current, and\nadditionally reduce the training time required to learn new tasks by a factor\nof 3 or more. We present new experiments using the upgraded RL-based\ncontrollers on the TCV tokamak, which validate the simulation results achieved,\nand point the way towards routinely achieving accurate discharges using the RL\napproach.\n","authors":["Brendan D. Tracey","Andrea Michi","Yuri Chervonyi","Ian Davies","Cosmin Paduraru","Nevena Lazic","Federico Felici","Timo Ewalds","Craig Donner","Cristian Galperti","Jonas Buchli","Michael Neunert","Andrea Huber","Jonathan Evens","Paula Kurylowicz","Daniel J. 
Mankowitz","Martin Riedmiller","The TCV Team"],"pdf_url":"https://arxiv.org/pdf/2307.11546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11532v1","updated":"2023-07-21T12:26:42Z","published":"2023-07-21T12:26:42Z","title":"Training Latency Minimization for Model-Splitting Allowed Federated Edge\n Learning","summary":" To alleviate the shortage of computing power faced by clients in training\ndeep neural networks (DNNs) using federated learning (FL), we leverage the edge\ncomputing and split learning to propose a model-splitting allowed FL (SFL)\nframework, with the aim to minimize the training latency without loss of test\naccuracy. Under the synchronized global update setting, the latency to complete\na round of global training is determined by the maximum latency for the clients\nto complete a local training session. Therefore, the training latency\nminimization problem (TLMP) is modelled as a minimizing-maximum problem. To\nsolve this mixed integer nonlinear programming problem, we first propose a\nregression method to fit the quantitative-relationship between the cut-layer\nand other parameters of an AI-model, and thus, transform the TLMP into a\ncontinuous problem. Considering that the two subproblems involved in the TLMP,\nnamely, the cut-layer selection problem for the clients and the computing\nresource allocation problem for the parameter-server are relative independence,\nan alternate-optimization-based algorithm with polynomial time complexity is\ndeveloped to obtain a high-quality solution to the TLMP. Extensive experiments\nare performed on a popular DNN-model EfficientNetV2 using dataset MNIST, and\nthe results verify the validity and improved performance of the proposed SFL\nframework.\n","authors":["Yao Wen","Guopeng Zhang","Kezhi Wang","Kun Yang"],"pdf_url":"https://arxiv.org/pdf/2307.11532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01639v2","updated":"2023-07-21T12:16:41Z","published":"2023-06-02T15:59:47Z","title":"Reduction of finite sampling noise in quantum neural networks","summary":" Quantum neural networks (QNNs) use parameterized quantum circuits with\ndata-dependent inputs and generate outputs through the evaluation of\nexpectation values. Calculating these expectation values necessitates repeated\ncircuit evaluations, thus introducing fundamental finite-sampling noise even on\nerror-free quantum computers. We reduce this noise by introducing the variance\nregularization, a technique for reducing the variance of the expectation value\nduring the quantum model training. This technique requires no additional\ncircuit evaluations if the QNN is properly constructed. Our empirical findings\ndemonstrate the reduced variance speeds up the training and lowers the output\nnoise as well as decreases the number of necessary evaluations of gradient\ncircuits. This regularization method is benchmarked on the regression of\nmultiple functions. We show that in our examples, it lowers the variance by an\norder of magnitude on average and leads to a significantly reduced noise level\nof the QNN. We finally demonstrate QNN training on a real quantum device and\nevaluate the impact of error mitigation. Here, the optimization is feasible\nonly due to the reduced number of necessary shots in the gradient evaluation\nresulting from the reduced variance.\n","authors":["David A. 
Kreplin","Marco Roth"],"pdf_url":"https://arxiv.org/pdf/2306.01639v2.pdf","comment":"11 pages, 10 figures; refined section 5"},{"id":"http://arxiv.org/abs/2306.07308v3","updated":"2023-07-21T11:52:28Z","published":"2023-06-12T13:48:37Z","title":"Self-Supervised Hyperspectral Inpainting with the Optimisation inspired\n Deep Neural Network Prior","summary":" Hyperspectral Image (HSI)s cover hundreds or thousands of narrow spectral\nbands, conveying a wealth of spatial and spectral information. However, due to\nthe instrumental errors and the atmospheric changes, the HSI obtained in\npractice are often contaminated by noise and dead pixels(lines), resulting in\nmissing information that may severely compromise the subsequent applications.\nWe introduce here a novel HSI missing pixel prediction algorithm, called Low\nRank and Sparsity Constraint Plug-and-Play (LRS-PnP). It is shown that LRS-PnP\nis able to predict missing pixels and bands even when all spectral bands of the\nimage are missing. The proposed LRS-PnP algorithm is further extended to a\nself-supervised model by combining the LRS-PnP with the Deep Image Prior (DIP),\ncalled LRS-PnP-DIP. In a series of experiments with real data, It is shown that\nthe LRS-PnP-DIP either achieves state-of-the-art inpainting performance\ncompared to other learning-based methods, or outperforms them.\n","authors":["Shuo Li","Mehrdad Yaghoobi"],"pdf_url":"https://arxiv.org/pdf/2306.07308v3.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2303.06067v2","updated":"2023-07-21T11:40:45Z","published":"2023-03-10T16:48:54Z","title":"Modeling Events and Interactions through Temporal Processes -- A Survey","summary":" In real-world scenario, many phenomena produce a collection of events that\noccur in continuous time. Point Processes provide a natural mathematical\nframework for modeling these sequences of events. In this survey, we\ninvestigate probabilistic models for modeling event sequences through temporal\nprocesses. We revise the notion of event modeling and provide the mathematical\nfoundations that characterize the literature on the topic. We define an\nontology to categorize the existing approaches in terms of three families:\nsimple, marked, and spatio-temporal point processes. For each family, we\nsystematically review the existing approaches based based on deep learning.\nFinally, we analyze the scenarios where the proposed techniques can be used for\naddressing prediction and modeling aspects.\n","authors":["Angelica Liguori","Luciano Caroprese","Marco Minici","Bruno Veloso","Francesco Spinnato","Mirco Nanni","Giuseppe Manco","Joao Gama"],"pdf_url":"https://arxiv.org/pdf/2303.06067v2.pdf","comment":"Image replacements"},{"id":"http://arxiv.org/abs/2304.14118v2","updated":"2023-07-21T11:36:40Z","published":"2023-04-27T12:05:34Z","title":"Learning Neural PDE Solvers with Parameter-Guided Channel Attention","summary":" Scientific Machine Learning (SciML) is concerned with the development of\nlearned emulators of physical systems governed by partial differential\nequations (PDE). In application domains such as weather forecasting, molecular\ndynamics, and inverse design, ML-based surrogate models are increasingly used\nto augment or replace inefficient and often non-differentiable numerical\nsimulation algorithms. 
While a number of ML-based methods for approximating the\nsolutions of PDEs have been proposed in recent years, they typically do not\nadapt to the parameters of the PDEs, making it difficult to generalize to PDE\nparameters not seen during training. We propose a Channel Attention mechanism\nguided by PDE Parameter Embeddings (CAPE) component for neural surrogate models\nand a simple yet effective curriculum learning strategy. The CAPE module can be\ncombined with neural PDE solvers allowing them to adapt to unseen PDE\nparameters. The curriculum learning strategy provides a seamless transition\nbetween teacher-forcing and fully auto-regressive training. We compare CAPE in\nconjunction with the curriculum learning strategy using a popular PDE benchmark\nand obtain consistent and significant improvements over the baseline models.\nThe experiments also show several advantages of CAPE, such as its increased\nability to generalize to unseen PDE parameters without large increases in\ninference time and parameter count.\n","authors":["Makoto Takamoto","Francesco Alesiani","Mathias Niepert"],"pdf_url":"https://arxiv.org/pdf/2304.14118v2.pdf","comment":"accepted for publication in ICML2023"},{"id":"http://arxiv.org/abs/2306.00988v2","updated":"2023-07-21T11:27:10Z","published":"2023-06-01T17:59:57Z","title":"Continual Learning for Abdominal Multi-Organ and Tumor Segmentation","summary":" The ability to dynamically extend a model to new data and classes is critical\nfor multiple organ and tumor segmentation. However, due to privacy regulations,\naccessing previous data and annotations can be problematic in the medical\ndomain. This poses a significant barrier to preserving the high segmentation\naccuracy of the old classes when learning from new classes because of the\ncatastrophic forgetting problem. In this paper, we first empirically\ndemonstrate that simply using high-quality pseudo labels can fairly mitigate\nthis problem in the setting of organ segmentation. Furthermore, we put forward\nan innovative architecture designed specifically for continual organ and tumor\nsegmentation, which incurs minimal computational overhead. Our proposed design\ninvolves replacing the conventional output layer with a suite of lightweight,\nclass-specific heads, thereby offering the flexibility to accommodate newly\nemerging classes. These heads enable independent predictions for newly\nintroduced and previously learned classes, effectively minimizing the impact of\nnew classes on old ones during the course of continual learning. We further\npropose incorporating Contrastive Language-Image Pretraining (CLIP) embeddings\ninto the organ-specific heads. These embeddings encapsulate the semantic\ninformation of each class, informed by extensive image-text co-training. The\nproposed method is evaluated on both in-house and public abdominal CT datasets\nunder organ and tumor segmentation tasks. 
Empirical results suggest that the\nproposed design improves the segmentation performance of a baseline neural\nnetwork on newly-introduced and previously-learned classes along the learning\ntrajectory.\n","authors":["Yixiao Zhang","Xinyi Li","Huimiao Chen","Alan Yuille","Yaoyao Liu","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.00988v2.pdf","comment":"MICCAI-2023"},{"id":"http://arxiv.org/abs/2307.11503v1","updated":"2023-07-21T11:19:00Z","published":"2023-07-21T11:19:00Z","title":"General regularization in covariate shift adaptation","summary":" Sample reweighting is one of the most widely used methods for correcting the\nerror of least squares learning algorithms in reproducing kernel Hilbert spaces\n(RKHS), that is caused by future data distributions that are different from the\ntraining data distribution. In practical situations, the sample weights are\ndetermined by values of the estimated Radon-Nikod\\'ym derivative, of the future\ndata distribution w.r.t.~the training data distribution. In this work, we\nreview known error bounds for reweighted kernel regression in RKHS and obtain,\nby combination, novel results. We show under weak smoothness conditions, that\nthe amount of samples, needed to achieve the same order of accuracy as in the\nstandard supervised learning without differences in data distributions, is\nsmaller than proven by state-of-the-art analyses.\n","authors":["Duc Hoan Nguyen","Sergei V. Pereverzyev","Werner Zellinger"],"pdf_url":"https://arxiv.org/pdf/2307.11503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11494v1","updated":"2023-07-21T10:56:36Z","published":"2023-07-21T10:56:36Z","title":"Predict, Refine, Synthesize: Self-Guiding Diffusion Models for\n Probabilistic Time Series Forecasting","summary":" Diffusion models have achieved state-of-the-art performance in generative\nmodeling tasks across various domains. Prior works on time series diffusion\nmodels have primarily focused on developing conditional models tailored to\nspecific forecasting or imputation tasks. In this work, we explore the\npotential of task-agnostic, unconditional diffusion models for several time\nseries applications. We propose TSDiff, an unconditionally trained diffusion\nmodel for time series. Our proposed self-guidance mechanism enables\nconditioning TSDiff for downstream tasks during inference, without requiring\nauxiliary networks or altering the training procedure. We demonstrate the\neffectiveness of our method on three different time series tasks: forecasting,\nrefinement, and synthetic data generation. First, we show that TSDiff is\ncompetitive with several task-specific conditional forecasting methods\n(predict). Second, we leverage the learned implicit probability density of\nTSDiff to iteratively refine the predictions of base forecasters with reduced\ncomputational overhead over reverse diffusion (refine). 
Notably, the generative\nperformance of the model remains intact -- downstream forecasters trained on\nsynthetic samples from TSDiff outperform forecasters that are trained on\nsamples from other state-of-the-art generative time series models, occasionally\neven outperforming models trained on real data (synthesize).\n","authors":["Marcel Kollovieh","Abdul Fatir Ansari","Michael Bohlke-Schneider","Jasper Zschiegner","Hao Wang","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11487v1","updated":"2023-07-21T10:45:08Z","published":"2023-07-21T10:45:08Z","title":"A New Deep State-Space Analysis Framework for Patient Latent State\n Estimation and Classification from EHR Time Series Data","summary":" Many diseases, including cancer and chronic conditions, require extended\ntreatment periods and long-term strategies. Machine learning and AI research\nfocusing on electronic health records (EHRs) have emerged to address this need.\nEffective treatment strategies involve more than capturing sequential changes\nin patient test values. It requires an explainable and clinically interpretable\nmodel by capturing the patient's internal state over time.\n In this study, we propose the \"deep state-space analysis framework,\" using\ntime-series unsupervised learning of EHRs with a deep state-space model. This\nframework enables learning, visualizing, and clustering of temporal changes in\npatient latent states related to disease progression.\n We evaluated our framework using time-series laboratory data from 12,695\ncancer patients. By estimating latent states, we successfully discover latent\nstates related to prognosis. By visualization and cluster analysis, the\ntemporal transition of patient status and test items during state transitions\ncharacteristic of each anticancer drug were identified. Our framework surpasses\nexisting methods in capturing interpretable latent space. It can be expected to\nenhance our comprehension of disease progression from EHRs, aiding treatment\nadjustments and prognostic determinations.\n","authors":["Aya Nakamura","Ryosuke Kojima","Yuji Okamoto","Eiichiro Uchino","Yohei Mineharu","Yohei Harada","Mayumi Kamada","Manabu Muto","Motoko Yanagita","Yasushi Okuno"],"pdf_url":"https://arxiv.org/pdf/2307.11487v1.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.06092v3","updated":"2023-07-21T10:04:23Z","published":"2023-07-12T11:35:37Z","title":"Quantitative CLTs in Deep Neural Networks","summary":" We study the distribution of a fully connected neural network with random\nGaussian weights and biases in which the hidden layer widths are proportional\nto a large constant $n$. Under mild assumptions on the non-linearity, we obtain\nquantitative bounds on normal approximations valid at large but finite $n$ and\nany fixed network depth. Our theorems show both for the finite-dimensional\ndistributions and the entire process, that the distance between a random fully\nconnected network (and its derivatives) to the corresponding infinite width\nGaussian process scales like $n^{-\\gamma}$ for $\\gamma>0$, with the exponent\ndepending on the metric used to measure discrepancy. 
Our bounds are strictly\nstronger in terms of their dependence on network width than any previously\navailable in the literature; in the one-dimensional case, we also prove that\nthey are optimal, i.e., we establish matching lower bounds.\n","authors":["Stefano Favaro","Boris Hanin","Domenico Marinucci","Ivan Nourdin","Giovanni Peccati"],"pdf_url":"https://arxiv.org/pdf/2307.06092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11465v1","updated":"2023-07-21T10:01:55Z","published":"2023-07-21T10:01:55Z","title":"A Deep Learning Approach for Overall Survival Analysis with Missing\n Values","summary":" One of the most challenging fields where Artificial Intelligence (AI) can be\napplied is lung cancer research, specifically non-small cell lung cancer\n(NSCLC). In particular, overall survival (OS) is a vital indicator of patient\nstatus, helping to identify subgroups with diverse survival probabilities,\nenabling tailored treatment and improved OS rates. In this analysis, there are\ntwo challenges to take into account. First, few studies effectively exploit the\ninformation available from each patient, leveraging both uncensored (i.e.,\ndead) and censored (i.e., survivors) patients, considering also the death\ntimes. Second, the handling of incomplete data is a common issue in the medical\nfield. This problem is typically tackled through the use of imputation methods.\nOur objective is to present an AI model able to overcome these limits,\neffectively learning from both censored and uncensored patients and their\navailable features, for the prediction of OS for NSCLC patients. We present a\nnovel approach to survival analysis in the context of NSCLC, which exploits the\nstrengths of the transformer architecture accounting for only available\nfeatures without requiring any imputation strategy. By making use of ad-hoc\nlosses for OS, it accounts for both censored and uncensored patients,\nconsidering risks over time. We evaluated the results over a period of 6 years\nusing different time granularities obtaining a Ct-index, a time-dependent\nvariant of the C-index, of 71.97, 77.58 and 80.72 for time units of 1 month, 1\nyear and 2 years, respectively, outperforming all state-of-the-art methods\nregardless of the imputation method used.\n","authors":["Camillo Maria Caruso","Valerio Guarrasi","Sara Ramella","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2307.11465v1.pdf","comment":"19 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.11462v1","updated":"2023-07-21T09:55:44Z","published":"2023-07-21T09:55:44Z","title":"Improve Long-term Memory Learning Through Rescaling the Error Temporally","summary":" This paper studies the error metric selection for long-term memory learning\nin sequence modelling. We examine the bias towards short-term memory in\ncommonly used errors, including mean absolute/squared error. Our findings show\nthat all temporally positive-weighted errors are biased towards short-term\nmemory in learning linear functionals. To reduce this bias and improve\nlong-term memory learning, we propose the use of a temporally rescaled error.\nIn addition to reducing the bias towards short-term memory, this approach can\nalso alleviate the vanishing gradient issue. We conduct numerical experiments\non different long-memory tasks and sequence models to validate our claims.\nNumerical results confirm the importance of appropriate temporally rescaled\nerror for effective long-term memory learning. 
To the best of our knowledge,\nthis is the first work that quantitatively analyzes different errors' memory\nbias towards short-term memory in sequence modelling.\n","authors":["Shida Wang","Zhanglu Yan"],"pdf_url":"https://arxiv.org/pdf/2307.11462v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.10617v2","updated":"2023-07-21T09:49:15Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v2.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.05825v2","updated":"2023-07-21T09:47:20Z","published":"2023-07-11T22:16:13Z","title":"Bayesian taut splines for estimating the number of modes","summary":" The number of modes in a probability density function is representative of\nthe model's complexity and can also be viewed as the number of existing\nsubpopulations. Despite its relevance, little research has been devoted to its\nestimation. Focusing on the univariate setting, we propose a novel approach\ntargeting prediction accuracy inspired by some overlooked aspects of the\nproblem. We argue for the need for structure in the solutions, the subjective\nand uncertain nature of modes, and the convenience of a holistic view blending\nglobal and local density properties. Our method builds upon a combination of\nflexible kernel estimators and parsimonious compositional splines. Feature\nexploration, model selection and mode testing are implemented in the Bayesian\ninference paradigm, providing soft solutions and allowing to incorporate expert\njudgement in the process. 
The usefulness of our proposal is illustrated through\na case study in sports analytics, showcasing multiple companion visualisation\ntools. A thorough simulation study demonstrates that traditional\nmodality-driven approaches paradoxically struggle to provide accurate results.\nIn this context, our method emerges as a top-tier alternative offering\ninnovative solutions for analysts.\n","authors":["José E. Chacón","Javier Fernández Serrano"],"pdf_url":"https://arxiv.org/pdf/2307.05825v2.pdf","comment":"20 pages, 8 figures (manuscript) + 19 pages, 16 figures\n (supplementary material)"},{"id":"http://arxiv.org/abs/2307.10926v2","updated":"2023-07-21T09:47:01Z","published":"2023-07-20T14:52:45Z","title":"Confidence intervals for performance estimates in 3D medical image\n segmentation","summary":" Medical segmentation models are evaluated empirically. As such an evaluation\nis based on a limited set of example images, it is unavoidably noisy. Beyond a\nmean performance measure, reporting confidence intervals is thus crucial.\nHowever, this is rarely done in medical image segmentation. The width of the\nconfidence interval depends on the test set size and on the spread of the\nperformance measure (its standard-deviation across of the test set). For\nclassification, many test images are needed to avoid wide confidence intervals.\nSegmentation, however, has not been studied, and it differs by the amount of\ninformation brought by a given test image. In this paper, we study the typical\nconfidence intervals in medical image segmentation. We carry experiments on 3D\nimage segmentation using the standard nnU-net framework, two datasets from the\nMedical Decathlon challenge and two performance measures: the Dice accuracy and\nthe Hausdorff distance. We show that the parametric confidence intervals are\nreasonable approximations of the bootstrap estimates for varying test set sizes\nand spread of the performance metric. Importantly, we show that the test size\nneeded to achieve a given precision is often much lower than for classification\ntasks. Typically, a 1% wide confidence interval requires about 100-200 test\nsamples when the spread is low (standard-deviation around 3%). More difficult\nsegmentation tasks may lead to higher spreads and require over 1000 samples.\n","authors":["R. El Jurdi","G. Varoquaux","O. Colliot"],"pdf_url":"https://arxiv.org/pdf/2307.10926v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.02953v2","updated":"2023-07-21T09:26:06Z","published":"2023-07-06T12:39:06Z","title":"SegNetr: Rethinking the local-global interactions and skip connections\n in U-shaped networks","summary":" Recently, U-shaped networks have dominated the field of medical image\nsegmentation due to their simple and easily tuned structure. However, existing\nU-shaped segmentation networks: 1) mostly focus on designing complex\nself-attention modules to compensate for the lack of long-term dependence based\non convolution operation, which increases the overall number of parameters and\ncomputational complexity of the network; 2) simply fuse the features of encoder\nand decoder, ignoring the connection between their spatial locations. In this\npaper, we rethink the above problem and build a lightweight medical image\nsegmentation network, called SegNetr. Specifically, we introduce a novel\nSegNetr block that can perform local-global interactions dynamically at any\nstage and with only linear complexity. 
At the same time, we design a general\ninformation retention skip connection (IRSC) to preserve the spatial location\ninformation of encoder features and achieve accurate fusion with the decoder\nfeatures. We validate the effectiveness of SegNetr on four mainstream medical\nimage segmentation datasets, with 59\% and 76\% fewer parameters and GFLOPs\nthan vanilla U-Net, while achieving segmentation performance comparable to\nstate-of-the-art methods. Notably, the components proposed in this paper can be\napplied to other U-shaped networks to improve their segmentation performance.\n","authors":["Junlong Cheng","Chengrui Gao","Fengjie Wang","Min Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.02953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04246v2","updated":"2023-07-21T09:15:42Z","published":"2023-02-08T18:26:10Z","title":"Shortcut Detection with Variational Autoencoders","summary":" For real-world applications of machine learning (ML), it is essential that\nmodels make predictions based on well-generalizing features rather than\nspurious correlations in the data. The identification of such spurious\ncorrelations, also known as shortcuts, is a challenging problem and has so far\nbeen scarcely addressed. In this work, we present a novel approach to detect\nshortcuts in image and audio datasets by leveraging variational autoencoders\n(VAEs). The disentanglement of features in the latent space of VAEs allows us\nto discover feature-target correlations in datasets and semi-automatically\nevaluate them for ML shortcuts. We demonstrate the applicability of our method\non several real-world datasets and identify shortcuts that have not been\ndiscovered before.\n","authors":["Nicolas M. Müller","Simon Roschmann","Shahbaz Khan","Philip Sperl","Konstantin Böttinger"],"pdf_url":"https://arxiv.org/pdf/2302.04246v2.pdf","comment":"Accepted at the ICML 2023 Workshop on Spurious Correlations,\n Invariance and Stability"},{"id":"http://arxiv.org/abs/2303.09975v4","updated":"2023-07-21T09:05:53Z","published":"2023-03-17T13:48:17Z","title":"MedNeXt: Transformer-driven Scaling of ConvNets for Medical Image\n Segmentation","summary":" There has been exploding interest in embracing Transformer-based\narchitectures for medical image segmentation. However, the lack of large-scale\nannotated medical datasets makes achieving performances equivalent to those in\nnatural images challenging. Convolutional networks, in contrast, have higher\ninductive biases and, consequently, are easily trainable to high performance.\nRecently, the ConvNeXt architecture attempted to modernize the standard ConvNet\nby mirroring Transformer blocks. In this work, we improve upon this to design a\nmodernized and scalable convolutional architecture customized to the challenges of\ndata-scarce medical settings. We introduce MedNeXt, a Transformer-inspired\nlarge kernel segmentation network which introduces: 1) A fully ConvNeXt 3D\nEncoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up\nand downsampling blocks to preserve semantic richness across scales, 3) A novel\ntechnique to iteratively increase kernel sizes by upsampling small kernel\nnetworks, to prevent performance saturation on limited medical data, 4)\nCompound scaling at multiple levels (depth, width, kernel size) of MedNeXt.\nThis leads to state-of-the-art performance on 4 tasks on CT and MRI modalities\nand varying dataset sizes, representing a modernized deep architecture for\nmedical image segmentation. 
Our code is made publicly available at:\nhttps://github.com/MIC-DKFZ/MedNeXt.\n","authors":["Saikat Roy","Gregor Koehler","Constantin Ulrich","Michael Baumgartner","Jens Petersen","Fabian Isensee","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.09975v4.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.11436v1","updated":"2023-07-21T08:57:16Z","published":"2023-07-21T08:57:16Z","title":"Neural Operators for Delay-Compensating Control of Hyperbolic PIDEs","summary":" The recently introduced DeepONet operator-learning framework for PDE control\nis extended from the results for basic hyperbolic and parabolic PDEs to an\nadvanced hyperbolic class that involves delays on both the state and the system\noutput or input. The PDE backstepping design produces gain functions that are\noutputs of a nonlinear operator, mapping functions on a spatial domain into\nfunctions on a spatial domain, and where this gain-generating operator's inputs\nare the PDE's coefficients. The operator is approximated with a DeepONet neural\nnetwork to a degree of accuracy that is provably arbitrarily tight. Once we\nproduce this approximation-theoretic result in infinite dimension, with it we\nestablish stability in closed loop under feedback that employs approximate\ngains. In addition to supplying such results under full-state feedback, we also\ndevelop DeepONet-approximated observers and output-feedback laws and prove\ntheir own stabilizing properties under neural operator approximations. With\nnumerical simulations we illustrate the theoretical results and quantify the\nnumerical effort savings, which are of two orders of magnitude, thanks to\nreplacing the numerical PDE solving with the DeepONet.\n","authors":["Jie Qi","Jing Zhang","Miroslav Krstic"],"pdf_url":"https://arxiv.org/pdf/2307.11436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11434v1","updated":"2023-07-21T08:55:23Z","published":"2023-07-21T08:55:23Z","title":"Batching for Green AI -- An Exploratory Study on Inference","summary":" The batch size is an essential parameter to tune during the development of\nnew neural networks. Amongst other quality indicators, it has a large degree of\ninfluence on the model's accuracy, generalisability, training times and\nparallelisability. This fact is generally known and commonly studied. However,\nduring the application phase of a deep learning model, when the model is\nutilised by an end-user for inference, we find that there is a disregard for\nthe potential benefits of introducing a batch size. In this study, we examine\nthe effect of input batching on the energy consumption and response times of\nfive fully-trained neural networks for computer vision that were considered\nstate-of-the-art at the time of their publication. The results suggest that\nbatching has a significant effect on both of these metrics. Furthermore, we\npresent a timeline of the energy efficiency and accuracy of neural networks\nover the past decade. We find that in general, energy consumption rises at a\nmuch steeper pace than accuracy and question the necessity of this evolution.\nAdditionally, we highlight one particular network, ShuffleNetV2(2018), that\nachieved a competitive performance for its time while maintaining a much lower\nenergy consumption. 
Nevertheless, we highlight that the results are model\ndependent.\n","authors":["Tim Yarally","Luís Cruz","Daniel Feitosa","June Sallou","Arie van Deursen"],"pdf_url":"https://arxiv.org/pdf/2307.11434v1.pdf","comment":"8 pages, 4 figures, 1 table. Accepted at Euromicro Conference Series\n on Software Engineering and Advanced Applications (SEAA) 2023"},{"id":"http://arxiv.org/abs/2307.11432v1","updated":"2023-07-21T08:52:08Z","published":"2023-07-21T08:52:08Z","title":"An Analysis of Multi-Agent Reinforcement Learning for Decentralized\n Inventory Control Systems","summary":" Most solutions to the inventory management problem assume a centralization of\ninformation that is incompatible with organisational constraints in real supply\nchain networks. The inventory management problem is a well-known planning\nproblem in operations research, concerned with finding the optimal re-order\npolicy for nodes in a supply chain. While many centralized solutions to the\nproblem exist, they are not applicable to real-world supply chains made up of\nindependent entities. The problem can however be naturally decomposed into\nsub-problems, each associated with an independent entity, turning it into a\nmulti-agent system. Therefore, a decentralized data-driven solution to\ninventory management problems using multi-agent reinforcement learning is\nproposed where each entity is controlled by an agent. Three multi-agent\nvariations of the proximal policy optimization algorithm are investigated\nthrough simulations of different supply chain networks and levels of\nuncertainty. The centralized training decentralized execution framework is\ndeployed, which relies on offline centralization during simulation-based policy\nidentification, but enables decentralization when the policies are deployed\nonline to the real system. Results show that using multi-agent proximal policy\noptimization with a centralized critic leads to performance very close to that\nof a centralized data-driven solution and outperforms a distributed model-based\nsolution in most cases while respecting the information constraints of the\nsystem.\n","authors":["Marwan Mousa","Damien van de Berg","Niki Kotecha","Ehecatl Antonio del Rio-Chanona","Max Mowbray"],"pdf_url":"https://arxiv.org/pdf/2307.11432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15342v2","updated":"2023-07-21T08:51:09Z","published":"2023-05-24T16:55:49Z","title":"Is Your Model \"MADD\"? A Novel Metric to Evaluate Algorithmic Fairness\n for Predictive Student Models","summary":" Predictive student models are increasingly used in learning environments due\nto their ability to enhance educational outcomes and support stakeholders in\nmaking informed decisions. However, predictive models can be biased and produce\nunfair outcomes, leading to potential discrimination against some students and\npossible harmful long-term implications. This has prompted research on fairness\nmetrics meant to capture and quantify such biases. Nonetheless, so far,\nexisting fairness metrics used in education are predictive\nperformance-oriented, focusing on assessing biased outcomes across groups of\nstudents, without considering the behaviors of the models nor the severity of\nthe biases in the outcomes. Therefore, we propose a novel metric, the Model\nAbsolute Density Distance (MADD), to analyze models' discriminatory behaviors\nindependently from their predictive performance. 
We also provide a\ncomplementary visualization-based analysis to enable fine-grained human\nassessment of how the models discriminate between groups of students. We\nevaluate our approach on the common task of predicting student success in\nonline courses, using several common predictive classification models on an\nopen educational dataset. We also compare our metric to the only predictive\nperformance-oriented fairness metric developed in education, ABROCA. Results on\nthis dataset show that: (1) fair predictive performance does not guarantee fair\nmodels' behaviors and thus fair outcomes, (2) there is no direct relationship\nbetween data bias and predictive performance bias nor discriminatory behaviors\nbias, and (3) trained on the same data, models exhibit different discriminatory\nbehaviors, according to different sensitive features too. We thus recommend\nusing the MADD on models that show satisfying predictive performance, to gain a\nfiner-grained understanding on how they behave and to refine models selection\nand their usage.\n","authors":["Mélina Verger","Sébastien Lallé","François Bouchet","Vanda Luengo"],"pdf_url":"https://arxiv.org/pdf/2305.15342v2.pdf","comment":"12 pages, conference"},{"id":"http://arxiv.org/abs/2307.11423v1","updated":"2023-07-21T08:33:55Z","published":"2023-07-21T08:33:55Z","title":"Attention to Entropic Communication","summary":" The concept of attention, numerical weights that emphasize the importance of\nparticular data, has proven to be very relevant in artificial intelligence.\nRelative entropy (RE, aka Kullback-Leibler divergence) plays a central role in\ncommunication theory. Here we combine these concepts, attention and RE. RE\nguides optimal encoding of messages in bandwidth-limited communication as well\nas optimal message decoding via the maximum entropy principle (MEP). In the\ncoding scenario, RE can be derived from four requirements, namely being\nanalytical, local, proper, and calibrated. Weighted RE, used for attention\nsteering in communications, turns out to be improper. To see how proper\nattention communication can emerge, we analyze a scenario of a message sender\nwho wants to ensure that the receiver of the message can perform well-informed\nactions. If the receiver decodes the message using the MEP, the sender only\nneeds to know the receiver's utility function to inform optimally, but not the\nreceiver's initial knowledge state. In case only the curvature of the utility\nfunction maxima are known, it becomes desirable to accurately communicate an\nattention function, in this case a by this curvature weighted and re-normalized\nprobability function. Entropic attention communication is here proposed as the\ndesired generalization of entropic communication that permits weighting while\nbeing proper, thereby aiding the design of optimal communication protocols in\ntechnical applications and helping to understand human communication. 
For\nexample, our analysis shows how to derive the level of cooperation expected\nunder misaligned interests of otherwise honest communication partners.\n","authors":["Torsten Enßlin","Carolin Weidinger","Philipp Frank"],"pdf_url":"https://arxiv.org/pdf/2307.11423v1.pdf","comment":"23 pages, 4 figures, submitted"},{"id":"http://arxiv.org/abs/2306.09087v2","updated":"2023-07-21T08:32:35Z","published":"2023-06-15T12:33:39Z","title":"Deep learning based Meta-modeling for Multi-objective Technology\n Optimization of Electrical Machines","summary":" Optimization of rotating electrical machines is both time- and\ncomputationally expensive. Because of the different parametrization, design\noptimization is commonly executed separately for each machine technology. In\nthis paper, we present the application of a variational auto-encoder (VAE) to\noptimize two different machine technologies simultaneously, namely an\nasynchronous machine and a permanent magnet synchronous machine. After\ntraining, we employ a deep neural network and a decoder as meta-models to\npredict global key performance indicators (KPIs) and generate associated new\ndesigns, respectively, through unified latent space in the optimization loop.\nNumerical results demonstrate concurrent parametric multi-objective technology\noptimization in the high-dimensional design space. The VAE-based approach is\nquantitatively compared to a classical deep learning-based direct approach for\nKPIs prediction.\n","authors":["Vivek Parekh","Dominik Flore","Sebastian Schöps"],"pdf_url":"https://arxiv.org/pdf/2306.09087v2.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2306.09260v2","updated":"2023-07-21T08:18:51Z","published":"2023-06-07T14:22:41Z","title":"IsoEx: an explainable unsupervised approach to process event logs cyber\n investigation","summary":" 39 seconds. That is the timelapse between two consecutive cyber attacks as of\n2023. Meaning that by the time you are done reading this abstract, about 1 or 2\nadditional cyber attacks would have occurred somewhere in the world. In this\ncontext of highly increased frequency of cyber threats, Security Operation\nCenters (SOC) and Computer Emergency Response Teams (CERT) can be overwhelmed.\nIn order to relieve the cybersecurity teams in their investigative effort and\nhelp them focus on more added-value tasks, machine learning approaches and\nmethods started to emerge. This paper introduces a novel method, IsoEx, for\ndetecting anomalous and potentially problematic command lines during the\ninvestigation of contaminated devices. IsoEx is built around a set of features\nthat leverages the log structure of the command line, as well as its\nparent/child relationship, to achieve a greater accuracy than traditional\nmethods. To detect anomalies, IsoEx resorts to an unsupervised anomaly\ndetection technique that is both highly sensitive and lightweight. A key\ncontribution of the paper is its emphasis on interpretability, achieved through\nthe features themselves and the application of eXplainable Artificial\nIntelligence (XAI) techniques and visualizations. This is critical to ensure\nthe adoption of the method by SOC and CERT teams, as the paper argues that the\ncurrent literature on machine learning for log investigation has not adequately\naddressed the issue of explainability. 
This method was proven efficient in a\nreal-life environment as it was built to support a company\\'s SOC and CERT\n","authors":["Pierre Lavieille","Ismail Alaoui Hassani Atlas"],"pdf_url":"https://arxiv.org/pdf/2306.09260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11408v1","updated":"2023-07-21T08:07:16Z","published":"2023-07-21T08:07:16Z","title":"Direct and inverse modeling of soft robots by learning a condensed FEM\n model","summary":" The Finite Element Method (FEM) is a powerful modeling tool for predicting\nthe behavior of soft robots. However, its use for control can be difficult for\nnon-specialists of numerical computation: it requires an optimization of the\ncomputation to make it real-time. In this paper, we propose a learning-based\napproach to obtain a compact but sufficiently rich mechanical representation.\nOur choice is based on nonlinear compliance data in the actuator/effector space\nprovided by a condensation of the FEM model. We demonstrate that this compact\nmodel can be learned with a reasonable amount of data and, at the same time, be\nvery efficient in terms of modeling, since we can deduce the direct and inverse\nkinematics of the robot. We also show how to couple some models learned\nindividually in particular on an example of a gripper composed of two soft\nfingers. Other results are shown by comparing the inverse model derived from\nthe full FEM model and the one from the compact learned version. This work\nopens new perspectives, namely for the embedded control of soft robots, but\nalso for their design. These perspectives are also discussed in the paper.\n","authors":["Etienne Ménager","Tanguy Navez","Olivier Goury","Christian Duriez"],"pdf_url":"https://arxiv.org/pdf/2307.11408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09382v3","updated":"2023-07-21T07:59:06Z","published":"2023-06-15T12:59:04Z","title":"Sound Demixing Challenge 2023 Music Demixing Track Technical Report:\n TFC-TDF-UNet v3","summary":" In this report, we present our award-winning solutions for the Music Demixing\nTrack of Sound Demixing Challenge 2023. First, we propose TFC-TDF-UNet v3, a\ntime-efficient music source separation model that achieves state-of-the-art\nresults on the MUSDB benchmark. We then give full details regarding our\nsolutions for each Leaderboard, including a loss masking approach for\nnoise-robust training. Code for reproducing model training and final\nsubmissions is available at github.com/kuielab/sdx23.\n","authors":["Minseok Kim","Jun Hyung Lee","Soonyoung Jung"],"pdf_url":"https://arxiv.org/pdf/2306.09382v3.pdf","comment":"5 pages, 4 tables"},{"id":"http://arxiv.org/abs/2304.04250v2","updated":"2023-07-21T07:39:58Z","published":"2023-04-09T14:52:18Z","title":"Editable User Profiles for Controllable Text Recommendation","summary":" Methods for making high-quality recommendations often rely on learning latent\nrepresentations from interaction data. These methods, while performant, do not\nprovide ready mechanisms for users to control the recommendation they receive.\nOur work tackles this problem by proposing LACE, a novel concept value\nbottleneck model for controllable text recommendations. LACE represents each\nuser with a succinct set of human-readable concepts through retrieval given\nuser-interacted documents and learns personalized representations of the\nconcepts based on user documents. This concept based user profile is then\nleveraged to make recommendations. 
The design of our model affords control over\nthe recommendations through a number of intuitive interactions with a\ntransparent user profile. We first establish the quality of recommendations\nobtained from LACE in an offline evaluation on three recommendation tasks\nspanning six datasets in warm-start, cold-start, and zero-shot setups. Next, we\nvalidate the controllability of LACE under simulated user interactions.\nFinally, we implement LACE in an interactive controllable recommender system\nand conduct a user study to demonstrate that users are able to improve the\nquality of recommendations they receive through interactions with an editable\nuser profile.\n","authors":["Sheshera Mysore","Mahmood Jasim","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2304.04250v2.pdf","comment":"SIGIR-2023 Camera Ready"},{"id":"http://arxiv.org/abs/2307.11397v1","updated":"2023-07-21T07:29:38Z","published":"2023-07-21T07:29:38Z","title":"Probabilistic Modeling of Inter- and Intra-observer Variability in\n Medical Image Segmentation","summary":" Medical image segmentation is a challenging task, particularly due to inter-\nand intra-observer variability, even between medical experts. In this paper, we\npropose a novel model, called Probabilistic Inter-Observer and iNtra-Observer\nvariation NetwOrk (Pionono). It captures the labeling behavior of each rater\nwith a multidimensional probability distribution and integrates this\ninformation with the feature maps of the image to produce probabilistic\nsegmentation predictions. The model is optimized by variational inference and\ncan be trained end-to-end. It outperforms state-of-the-art models such as\nSTAPLE, Probabilistic U-Net, and models based on confusion matrices.\nAdditionally, Pionono predicts multiple coherent segmentation maps that mimic\nthe rater's expert opinion, which provides additional valuable information for\nthe diagnostic process. Experiments on real-world cancer segmentation datasets\ndemonstrate the high accuracy and efficiency of Pionono, making it a powerful\ntool for medical image analysis.\n","authors":["Arne Schmidt","Pablo Morales-Álvarez","Rafael Molina"],"pdf_url":"https://arxiv.org/pdf/2307.11397v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.06146v2","updated":"2023-07-21T06:34:54Z","published":"2023-03-10T18:59:33Z","title":"StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces","summary":" Recent advances in face manipulation using StyleGAN have produced impressive\nresults. However, StyleGAN is inherently limited to cropped aligned faces at a\nfixed image resolution it is pre-trained on. In this paper, we propose a simple\nand effective solution to this limitation by using dilated convolutions to\nrescale the receptive fields of shallow layers in StyleGAN, without altering\nany model parameters. This allows fixed-size small features at shallow layers\nto be extended into larger ones that can accommodate variable resolutions,\nmaking them more robust in characterizing unaligned faces. To enable real face\ninversion and manipulation, we introduce a corresponding encoder that provides\nthe first-layer feature of the extended StyleGAN in addition to the latent\nstyle code. 
We validate the effectiveness of our method using unaligned face\ninputs of various resolutions in a diverse set of face manipulation tasks,\nincluding facial attribute editing, super-resolution, sketch/mask-to-face\ntranslation, and face toonification.\n","authors":["Shuai Yang","Liming Jiang","Ziwei Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2303.06146v2.pdf","comment":"ICCV 2023. Code: https://github.com/williamyang1991/StyleGANEX\n Project page: https://www.mmlab-ntu.com/project/styleganex/"},{"id":"http://arxiv.org/abs/2307.11379v1","updated":"2023-07-21T06:34:41Z","published":"2023-07-21T06:34:41Z","title":"Towards Better Fairness-Utility Trade-off: A Comprehensive\n Measurement-Based Reinforcement Learning Framework","summary":" Machine learning is widely used to make decisions with societal impact such\nas bank loan approving, criminal sentencing, and resume filtering. How to\nensure its fairness while maintaining utility is a challenging but crucial\nissue. Fairness is a complex and context-dependent concept with over 70\ndifferent measurement metrics. Since existing regulations are often vague in\nterms of which metric to use and different organizations may prefer different\nfairness metrics, it is important to have means of improving fairness\ncomprehensively. Existing mitigation techniques often target at one specific\nfairness metric and have limitations in improving multiple notions of fairness\nsimultaneously. In this work, we propose CFU (Comprehensive Fairness-Utility),\na reinforcement learning-based framework, to efficiently improve the\nfairness-utility trade-off in machine learning classifiers. A comprehensive\nmeasurement that can simultaneously consider multiple fairness notions as well\nas utility is established, and new metrics are proposed based on an in-depth\nanalysis of the relationship between different fairness metrics. The reward\nfunction of CFU is constructed with comprehensive measurement and new metrics.\nWe conduct extensive experiments to evaluate CFU on 6 tasks, 3 machine learning\nmodels, and 15 fairness-utility measurements. The results demonstrate that CFU\ncan improve the classifier on multiple fairness metrics without sacrificing its\nutility. It outperforms all state-of-the-art techniques and has witnessed a\n37.5% improvement on average.\n","authors":["Simiao Zhang","Jitao Bai","Menghong Guan","Yihao Huang","Yueling Zhang","Jun Sun","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2307.11379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.07493v3","updated":"2023-07-21T06:28:40Z","published":"2022-05-16T07:53:42Z","title":"Multi-scale Attention Flow for Probabilistic Time Series Forecasting","summary":" The probability prediction of multivariate time series is a notoriously\nchallenging but practical task. On the one hand, the challenge is how to\neffectively capture the cross-series correlations between interacting time\nseries, to achieve accurate distribution modeling. On the other hand, we should\nconsider how to capture the contextual information within time series more\naccurately to model multivariate temporal dynamics of time series. In this\nwork, we proposed a novel non-autoregressive deep learning model, called\nMulti-scale Attention Normalizing Flow(MANF), where we integrate multi-scale\nattention and relative position information and the multivariate data\ndistribution is represented by the conditioned normalizing flow. 
Additionally,\ncompared with autoregressive modeling methods, our model avoids the influence\nof cumulative error and does not increase the time complexity. Extensive\nexperiments demonstrate that our model achieves state-of-the-art performance on\nmany popular multivariate datasets.\n","authors":["Shibo Feng","Chunyan Miao","Ke Xu","Jiaxiang Wu","Pengcheng Wu","Yang Zhang","Peilin Zhao"],"pdf_url":"https://arxiv.org/pdf/2205.07493v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11375v1","updated":"2023-07-21T06:17:09Z","published":"2023-07-21T06:17:09Z","title":"LatentAugment: Data Augmentation via Guided Manipulation of GAN's Latent\n Space","summary":" Data Augmentation (DA) is a technique to increase the quantity and diversity\nof the training data, and by that alleviate overfitting and improve\ngeneralisation. However, standard DA produces synthetic data for augmentation\nwith limited diversity. Generative Adversarial Networks (GANs) may unlock\nadditional information in a dataset by generating synthetic samples having the\nappearance of real images. However, these models struggle to simultaneously\naddress three key requirements: fidelity and high-quality samples; diversity\nand mode coverage; and fast sampling. Indeed, GANs generate high-quality\nsamples rapidly, but have poor mode coverage, limiting their adoption in DA\napplications. We propose LatentAugment, a DA strategy that overcomes the low\ndiversity of GANs, opening up for use in DA applications. Without external\nsupervision, LatentAugment modifies latent vectors and moves them into latent\nspace regions to maximise the synthetic images' diversity and fidelity. It is\nalso agnostic to the dataset and the downstream task. A wide set of experiments\nshows that LatentAugment improves the generalisation of a deep model\ntranslating from MRI-to-CT beating both standard DA as well GAN-based sampling.\nMoreover, still in comparison with GAN-based sampling, LatentAugment synthetic\nsamples show superior mode coverage and diversity. Code is available at:\nhttps://github.com/ltronchin/LatentAugment.\n","authors":["Lorenzo Tronchin","Minh H. Vu","Paolo Soda","Tommy Löfstedt"],"pdf_url":"https://arxiv.org/pdf/2307.11375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11373v1","updated":"2023-07-21T06:12:39Z","published":"2023-07-21T06:12:39Z","title":"Diverse Offline Imitation via Fenchel Duality","summary":" There has been significant recent progress in the area of unsupervised skill\ndiscovery, with various works proposing mutual information based objectives, as\na source of intrinsic motivation. Prior works predominantly focused on\ndesigning algorithms that require online access to the environment. In\ncontrast, we develop an \\textit{offline} skill discovery algorithm. Our problem\nformulation considers the maximization of a mutual information objective\nconstrained by a KL-divergence. More precisely, the constraints ensure that the\nstate occupancy of each skill remains close to the state occupancy of an\nexpert, within the support of an offline dataset with good state-action\ncoverage. 
Our main contribution is to connect Fenchel duality, reinforcement\nlearning and unsupervised skill discovery, and to give a simple offline\nalgorithm for learning diverse skills that are aligned with an expert.\n","authors":["Marin Vlastelica","Pavel Kolev","Jin Cheng","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2307.11373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11371v1","updated":"2023-07-21T06:03:43Z","published":"2023-07-21T06:03:43Z","title":"Random Separating Hyperplane Theorem and Learning Polytopes","summary":" The Separating Hyperplane theorem is a fundamental result in Convex Geometry\nwith myriad applications. Our first result, Random Separating Hyperplane\nTheorem (RSH), is a strengthening of this for polytopes. $\\rsh$ asserts that if\nthe distance between $a$ and a polytope $K$ with $k$ vertices and unit diameter\nin $\\Re^d$ is at least $\\delta$, where $\\delta$ is a fixed constant in $(0,1)$,\nthen a randomly chosen hyperplane separates $a$ and $K$ with probability at\nleast $1/poly(k)$ and margin at least $\\Omega \\left(\\delta/\\sqrt{d} \\right)$.\nAn immediate consequence of our result is the first near optimal bound on the\nerror increase in the reduction from a Separation oracle to an Optimization\noracle over a polytope.\n RSH has algorithmic applications in learning polytopes. We consider a\nfundamental problem, denoted the ``Hausdorff problem'', of learning a unit\ndiameter polytope $K$ within Hausdorff distance $\\delta$, given an optimization\noracle for $K$. Using RSH, we show that with polynomially many random queries\nto the optimization oracle, $K$ can be approximated within error $O(\\delta)$.\nTo our knowledge this is the first provable algorithm for the Hausdorff\nProblem. Building on this result, we show that if the vertices of $K$ are\nwell-separated, then an optimization oracle can be used to generate a list of\npoints, each within Hausdorff distance $O(\\delta)$ of $K$, with the property\nthat the list contains a point close to each vertex of $K$. Further, we show\nhow to prune this list to generate a (unique) approximation to each vertex of\nthe polytope. We prove that in many latent variable settings, e.g., topic\nmodeling, LDA, optimization oracles do exist provided we project to a suitable\nSVD subspace. Thus, our work yields the first efficient algorithm for finding\napproximations to the vertices of the latent polytope under the\nwell-separatedness assumption.\n","authors":["Chiranjib Bhattacharyya","Ravindran Kannan","Amit Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.11371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11357v1","updated":"2023-07-21T05:17:21Z","published":"2023-07-21T05:17:21Z","title":"Bridging the Reality Gap of Reinforcement Learning based Traffic Signal\n Control using Domain Randomization and Meta Learning","summary":" Reinforcement Learning (RL) has been widely explored in Traffic Signal\nControl (TSC) applications, however, still no such system has been deployed in\npractice. A key barrier to progress in this area is the reality gap, the\ndiscrepancy that results from differences between simulation models and their\nreal-world equivalents. In this paper, we address this challenge by first\npresenting a comprehensive analysis of potential simulation parameters that\ncontribute to this reality gap. We then also examine two promising strategies\nthat can bridge this gap: Domain Randomization (DR) and Model-Agnostic\nMeta-Learning (MAML). 
Both strategies were trained with a traffic simulation\nmodel of an intersection. In addition, the model was embedded in LemgoRL, a\nframework that integrates realistic, safety-critical requirements into the\ncontrol system. Subsequently, we evaluated the performance of the two methods\non a separate model of the same intersection that was developed with a\ndifferent traffic simulator. In this way, we mimic the reality gap. Our\nexperimental results show that both DR and MAML outperform a state-of-the-art\nRL algorithm, therefore highlighting their potential to mitigate the reality\ngap in RLbased TSC systems.\n","authors":["Arthur Müller","Matthia Sabatelli"],"pdf_url":"https://arxiv.org/pdf/2307.11357v1.pdf","comment":"Paper was accepted by the ITSC 2023 (26th IEEE International\n Conference on Intelligent Transportation Systems)"},{"id":"http://arxiv.org/abs/2307.09484v2","updated":"2023-07-21T05:13:55Z","published":"2023-06-06T12:45:15Z","title":"MolFM: A Multimodal Molecular Foundation Model","summary":" Molecular knowledge resides within three different modalities of information\nsources: molecular structures, biomedical documents, and knowledge bases.\nEffective incorporation of molecular knowledge from these modalities holds\nparamount significance in facilitating biomedical research. However, existing\nmultimodal molecular foundation models exhibit limitations in capturing\nintricate connections between molecular structures and texts, and more\nimportantly, none of them attempt to leverage a wealth of molecular expertise\nderived from knowledge graphs. In this study, we introduce MolFM, a multimodal\nmolecular foundation model designed to facilitate joint representation learning\nfrom molecular structures, biomedical texts, and knowledge graphs. We propose\ncross-modal attention between atoms of molecular structures, neighbors of\nmolecule entities and semantically related texts to facilitate cross-modal\ncomprehension. We provide theoretical analysis that our cross-modal\npre-training captures local and global molecular knowledge by minimizing the\ndistance in the feature space between different modalities of the same\nmolecule, as well as molecules sharing similar structures or functions. MolFM\nachieves state-of-the-art performance on various downstream tasks. On\ncross-modal retrieval, MolFM outperforms existing models with 12.13% and 5.04%\nabsolute gains under the zero-shot and fine-tuning settings, respectively.\nFurthermore, qualitative analysis showcases MolFM's implicit ability to provide\ngrounding from molecular substructures and knowledge graphs. Code and models\nare available on https://github.com/BioFM/OpenBioMed.\n","authors":["Yizhen Luo","Kai Yang","Massimo Hong","Xing Yi Liu","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2307.09484v2.pdf","comment":"31 pages, 15 figures, and 15 tables"},{"id":"http://arxiv.org/abs/2307.11353v1","updated":"2023-07-21T05:05:55Z","published":"2023-07-21T05:05:55Z","title":"What can a Single Attention Layer Learn? A Study Through the Random\n Features Lens","summary":" Attention layers -- which map a sequence of inputs to a sequence of outputs\n-- are core building blocks of the Transformer architecture which has achieved\nsignificant breakthroughs in modern artificial intelligence. This paper\npresents a rigorous theoretical study on the learning and generalization of a\nsingle multi-head attention layer, with a sequence of key vectors and a\nseparate query vector as input. 
We consider the random feature setting where\nthe attention layer has a large number of heads, with randomly sampled frozen\nquery and key matrices, and trainable value matrices. We show that such a\nrandom-feature attention layer can express a broad class of target functions\nthat are permutation invariant to the key vectors. We further provide\nquantitative excess risk bounds for learning these target functions from finite\nsamples, using random feature attention with finitely many heads.\n Our results feature several implications unique to the attention structure\ncompared with existing random features theory for neural networks, such as (1)\nAdvantages in the sample complexity over standard two-layer random-feature\nnetworks; (2) Concrete and natural classes of functions that can be learned\nefficiently by a random-feature attention layer; and (3) The effect of the\nsampling distribution of the query-key weight matrix (the product of the query\nand key matrix), where Gaussian random weights with a non-zero mean result in\nbetter sample complexities over the zero-mean counterpart for learning certain\nnatural target functions. Experiments on simulated data corroborate our\ntheoretical findings and further illustrate the interplay between the sample\nsize and the complexity of the target function.\n","authors":["Hengyu Fu","Tianyu Guo","Yu Bai","Song Mei"],"pdf_url":"https://arxiv.org/pdf/2307.11353v1.pdf","comment":"41pages, 5 figures"},{"id":"http://arxiv.org/abs/2106.06134v4","updated":"2023-07-21T05:02:21Z","published":"2021-06-11T02:44:00Z","title":"Is Homophily a Necessity for Graph Neural Networks?","summary":" Graph neural networks (GNNs) have shown great prowess in learning\nrepresentations suitable for numerous graph-based machine learning tasks. When\napplied to semi-supervised node classification, GNNs are widely believed to\nwork well due to the homophily assumption (\"like attracts like\"), and fail to\ngeneralize to heterophilous graphs where dissimilar nodes connect. Recent works\ndesign new architectures to overcome such heterophily-related limitations,\nciting poor baseline performance and new architecture improvements on a few\nheterophilous graph benchmark datasets as evidence for this notion. In our\nexperiments, we empirically find that standard graph convolutional networks\n(GCNs) can actually achieve better performance than such carefully designed\nmethods on some commonly used heterophilous graphs. This motivates us to\nreconsider whether homophily is truly necessary for good GNN performance. We\nfind that this claim is not quite true, and in fact, GCNs can achieve strong\nperformance on heterophilous graphs under certain conditions. Our work\ncarefully characterizes these conditions, and provides supporting theoretical\nunderstanding and empirical observations. Finally, we examine existing\nheterophilous graphs benchmarks and reconcile how the GCN (under)performs on\nthem based on this understanding.\n","authors":["Yao Ma","Xiaorui Liu","Neil Shah","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2106.06134v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11352v1","updated":"2023-07-21T04:59:23Z","published":"2023-07-21T04:59:23Z","title":"Model-based Offline Reinforcement Learning with Count-based Conservatism","summary":" In this paper, we propose a model-based offline reinforcement learning method\nthat integrates count-based conservatism, named $\\texttt{Count-MORL}$. 
Our\nmethod utilizes the count estimates of state-action pairs to quantify model\nestimation error, marking the first algorithm of demonstrating the efficacy of\ncount-based conservatism in model-based offline deep RL to the best of our\nknowledge. For our proposed method, we first show that the estimation error is\ninversely proportional to the frequency of state-action pairs. Secondly, we\ndemonstrate that the learned policy under the count-based conservative model\noffers near-optimality performance guarantees. Through extensive numerical\nexperiments, we validate that $\\texttt{Count-MORL}$ with hash code\nimplementation significantly outperforms existing offline RL algorithms on the\nD4RL benchmark datasets. The code is accessible at\n$\\href{https://github.com/oh-lab/Count-MORL}{https://github.com/oh-lab/Count-MORL}$.\n","authors":["Byeongchan Kim","Min-hwan Oh"],"pdf_url":"https://arxiv.org/pdf/2307.11352v1.pdf","comment":"Accepted in ICML 2023"},{"id":"http://arxiv.org/abs/2307.11351v1","updated":"2023-07-21T04:55:03Z","published":"2023-07-21T04:55:03Z","title":"Bounded P-values in Parametric Programming-based Selective Inference","summary":" Selective inference (SI) has been actively studied as a promising framework\nfor statistical hypothesis testing for data-driven hypotheses. The basic idea\nof SI is to make inferences conditional on an event that a hypothesis is\nselected. In order to perform SI, this event must be characterized in a\ntraceable form. When selection event is too difficult to characterize,\nadditional conditions are introduced for tractability. This additional\nconditions often causes the loss of power, and this issue is referred to as\nover-conditioning. Parametric programming-based SI (PP-based SI) has been\nproposed as one way to address the over-conditioning issue. The main problem of\nPP-based SI is its high computational cost due to the need to exhaustively\nexplore the data space. In this study, we introduce a procedure to reduce the\ncomputational cost while guaranteeing the desired precision, by proposing a\nmethod to compute the upper and lower bounds of p-values. We also proposed\nthree types of search strategies that efficiently improve these bounds. We\ndemonstrate the effectiveness of the proposed method in hypothesis testing\nproblems for feature selection in linear models and attention region\nidentification in deep neural networks.\n","authors":["Tomohiro Shiraishi","Daiki Miwa","Vo Nguyen Le Duy","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2307.11351v1.pdf","comment":"47pages, 14figures"},{"id":"http://arxiv.org/abs/2302.09738v5","updated":"2023-07-21T04:19:43Z","published":"2023-02-20T03:31:11Z","title":"Simplifying Momentum-based Positive-definite Submanifold Optimization\n with Applications to Deep Learning","summary":" Riemannian submanifold optimization with momentum is computationally\nchallenging because, to ensure that the iterates remain on the submanifold, we\noften need to solve difficult differential equations. Here, we simplify such\ndifficulties for a class of structured symmetric positive-definite matrices\nwith the affine-invariant metric. We do so by proposing a generalized version\nof the Riemannian normal coordinates that dynamically orthonormalizes the\nmetric and locally converts the problem into an unconstrained problem in the\nEuclidean space. 
We use our approach to simplify existing approaches for\nstructured covariances and develop matrix-inverse-free $2^\\text{nd}$-order\noptimizers for deep learning with low precision by using only matrix\nmultiplications. Code: https://github.com/yorkerlin/StructuredNGD-DL\n","authors":["Wu Lin","Valentin Duruisseaux","Melvin Leok","Frank Nielsen","Mohammad Emtiyaz Khan","Mark Schmidt"],"pdf_url":"https://arxiv.org/pdf/2302.09738v5.pdf","comment":"An updated version of the ICML 2023 paper. Updated the main text and\n added more numerical results for DNNs including a new baseline method and\n improving existing baseline methods"},{"id":"http://arxiv.org/abs/2307.11334v1","updated":"2023-07-21T03:43:07Z","published":"2023-07-21T03:43:07Z","title":"Improving Transferability of Adversarial Examples via Bayesian Attacks","summary":" This paper presents a substantial extension of our work published at ICLR.\nOur ICLR work advocated for enhancing transferability in adversarial examples\nby incorporating a Bayesian formulation into model parameters, which\neffectively emulates the ensemble of infinitely many deep neural networks,\nwhile, in this paper, we introduce a novel extension by incorporating the\nBayesian formulation into the model input as well, enabling the joint\ndiversification of both the model input and model parameters. Our empirical\nfindings demonstrate that: 1) the combination of Bayesian formulations for both\nthe model input and model parameters yields significant improvements in\ntransferability; 2) by introducing advanced approximations of the posterior\ndistribution over the model input, adversarial transferability achieves further\nenhancement, surpassing all state-of-the-arts when attacking without model\nfine-tuning. Moreover, we propose a principled approach to fine-tune model\nparameters in such an extended Bayesian formulation. The derived optimization\nobjective inherently encourages flat minima in the parameter space and input\nspace. Extensive experiments demonstrate that our method achieves a new\nstate-of-the-art on transfer-based attacks, improving the average success rate\non ImageNet and CIFAR-10 by 19.14% and 2.08%, respectively, when comparing with\nour ICLR basic Bayesian method. We will make our code publicly available.\n","authors":["Qizhang Li","Yiwen Guo","Xiaochen Yang","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11333v1","updated":"2023-07-21T03:41:55Z","published":"2023-07-21T03:41:55Z","title":"Demystifying Local and Global Fairness Trade-offs in Federated Learning\n Using Partial Information Decomposition","summary":" In this paper, we present an information-theoretic perspective to group\nfairness trade-offs in federated learning (FL) with respect to sensitive\nattributes, such as gender, race, etc. Existing works mostly focus on either\n\\emph{global fairness} (overall disparity of the model across all clients) or\n\\emph{local fairness} (disparity of the model at each individual client),\nwithout always considering their trade-offs. There is a lack of understanding\nof the interplay between global and local fairness in FL, and if and when one\nimplies the other. To address this gap, we leverage a body of work in\ninformation theory called partial information decomposition (PID) which first\nidentifies three sources of unfairness in FL, namely, \\emph{Unique Disparity},\n\\emph{Redundant Disparity}, and \\emph{Masked Disparity}. 
Using canonical\nexamples, we demonstrate how these three disparities contribute to global and\nlocal fairness. This decomposition helps us derive fundamental limits and\ntrade-offs between global or local fairness, particularly under data\nheterogeneity, as well as, derive conditions under which one implies the other.\nWe also present experimental results on benchmark datasets to support our\ntheoretical findings. This work offers a more nuanced understanding of the\nsources of disparity in FL that can inform the use of local disparity\nmitigation techniques, and their convergence and effectiveness when deployed in\npractice.\n","authors":["Faisal Hamman","Sanghamitra Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.11333v1.pdf","comment":"Accepted at ICML Workshop on Federated Learning and Analytics in\n Practice"},{"id":"http://arxiv.org/abs/2307.11332v1","updated":"2023-07-21T03:40:53Z","published":"2023-07-21T03:40:53Z","title":"Beyond Convergence: Identifiability of Machine Learning and Deep\n Learning Models","summary":" Machine learning (ML) and deep learning models are extensively used for\nparameter optimization and regression problems. However, not all inverse\nproblems in ML are ``identifiable,'' indicating that model parameters may not\nbe uniquely determined from the available data and the data model's\ninput-output relationship. In this study, we investigate the notion of model\nparameter identifiability through a case study focused on parameter estimation\nfrom motion sensor data. Utilizing a bipedal-spring mass human walk dynamics\nmodel, we generate synthetic data representing diverse gait patterns and\nconditions. Employing a deep neural network, we attempt to estimate\nsubject-wise parameters, including mass, stiffness, and equilibrium leg length.\nThe results show that while certain parameters can be identified from the\nobservation data, others remain unidentifiable, highlighting that\nunidentifiability is an intrinsic limitation of the experimental setup,\nnecessitating a change in data collection and experimental scenarios. Beyond\nthis specific case study, the concept of identifiability has broader\nimplications in ML and deep learning. Addressing unidentifiability requires\nproven identifiable models (with theoretical support), multimodal data fusion\ntechniques, and advancements in model-based machine learning. Understanding and\nresolving unidentifiability challenges will lead to more reliable and accurate\napplications across diverse domains, transcending mere model convergence and\nenhancing the reliability of machine learning models.\n","authors":["Reza Sameni"],"pdf_url":"https://arxiv.org/pdf/2307.11332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.10736v3","updated":"2023-07-21T03:39:05Z","published":"2022-03-21T05:00:54Z","title":"The activity-weight duality in feed forward neural networks: The\n geometric determinants of generalization","summary":" One of the fundamental problems in machine learning is generalization. In\nneural network models with a large number of weights (parameters), many\nsolutions can be found to fit the training data equally well. The key question\nis which solution can describe testing data not in the training set. Here, we\nreport the discovery of an exact duality (equivalence) between changes in\nactivities in a given layer of neurons and changes in weights that connect to\nthe next layer of neurons in a densely connected layer in any feed forward\nneural network. 
The activity-weight (A-W) duality allows us to map variations\nin inputs (data) to variations of the corresponding dual weights. By using this\nmapping, we show that the generalization loss can be decomposed into a sum of\ncontributions from different eigen-directions of the Hessian matrix of the loss\nfunction at the solution in weight space. The contribution from a given\neigen-direction is the product of two geometric factors (determinants): the\nsharpness of the loss landscape and the standard deviation of the dual weights,\nwhich is found to scale with the weight norm of the solution. Our results\nprovide an unified framework, which we used to reveal how different\nregularization schemes (weight decay, stochastic gradient descent with\ndifferent batch sizes and learning rates, dropout), training data size, and\nlabeling noise affect generalization performance by controlling either one or\nboth of these two geometric determinants for generalization. These insights can\nbe used to guide development of algorithms for finding more generalizable\nsolutions in overparametrized neural networks.\n","authors":["Yu Feng","Yuhai Tu"],"pdf_url":"https://arxiv.org/pdf/2203.10736v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11327v1","updated":"2023-07-21T03:24:55Z","published":"2023-07-21T03:24:55Z","title":"Systematic Adaptation of Communication-focused Machine Learning Models\n from Real to Virtual Environments for Human-Robot Collaboration","summary":" Virtual reality has proved to be useful in applications in several fields\nranging from gaming, medicine, and training to development of interfaces that\nenable human-robot collaboration. It empowers designers to explore applications\noutside of the constraints posed by the real world environment and develop\ninnovative solutions and experiences. Hand gestures recognition which has been\na topic of much research and subsequent commercialization in the real world has\nbeen possible because of the creation of large, labelled datasets. In order to\nutilize the power of natural and intuitive hand gestures in the virtual domain\nfor enabling embodied teleoperation of collaborative robots, similarly large\ndatasets must be created so as to keep the working interface easy to learn and\nflexible enough to add more gestures. Depending on the application, this may be\ncomputationally or economically prohibitive. Thus, the adaptation of trained\ndeep learning models that perform well in the real environment to the virtual\nmay be a solution to this challenge. This paper presents a systematic framework\nfor the real to virtual adaptation using limited size of virtual dataset along\nwith guidelines for creating a curated dataset. Finally, while hand gestures\nhave been considered as the communication mode, the guidelines and\nrecommendations presented are generic. These are applicable to other modes such\nas body poses and facial expressions which have large datasets available in the\nreal domain which must be adapted to the virtual one.\n","authors":["Debasmita Mukherjee","Ritwik Singhai","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2307.11327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11325v1","updated":"2023-07-21T03:23:17Z","published":"2023-07-21T03:23:17Z","title":"Analysis of Elephant Movement in Sub-Saharan Africa: Ecological,\n Climatic, and Conservation Perspectives","summary":" The interaction between elephants and their environment has profound\nimplications for both ecology and conservation strategies. 
This study presents\nan analytical approach to decipher the intricate patterns of elephant movement\nin Sub-Saharan Africa, concentrating on key ecological drivers such as seasonal\nvariations and rainfall patterns. Despite the complexities surrounding these\ninfluential factors, our analysis provides a holistic view of elephant\nmigratory behavior in the context of the dynamic African landscape. Our\ncomprehensive approach enables us to predict the potential impact of these\necological determinants on elephant migration, a critical step in establishing\ninformed conservation strategies. This projection is particularly crucial given\nthe impacts of global climate change on seasonal and rainfall patterns, which\ncould substantially influence elephant movements in the future. The findings of\nour work aim to not only advance the understanding of movement ecology but also\nfoster a sustainable coexistence of humans and elephants in Sub-Saharan Africa.\nBy predicting potential elephant routes, our work can inform strategies to\nminimize human-elephant conflict, effectively manage land use, and enhance\nanti-poaching efforts. This research underscores the importance of integrating\nmovement ecology and climatic variables for effective wildlife management and\nconservation planning.\n","authors":["Matthew Hines","Gregory Glatzer","Shreya Ghosh","Prasenjit Mitra"],"pdf_url":"https://arxiv.org/pdf/2307.11325v1.pdf","comment":"11 pages, 17 figures, Accepted in ACM SIGCAS SIGCHI Conference on\n Computing and Sustainable Societies (COMPASS 2023)"},{"id":"http://arxiv.org/abs/2307.11317v1","updated":"2023-07-21T02:57:40Z","published":"2023-07-21T02:57:40Z","title":"XLDA: Linear Discriminant Analysis for Scaling Continual Learning to\n Extreme Classification at the Edge","summary":" Streaming Linear Discriminant Analysis (LDA) while proven in\nClass-incremental Learning deployments at the edge with limited classes (upto\n1000), has not been proven for deployment in extreme classification scenarios.\nIn this paper, we present: (a) XLDA, a framework for Class-IL in edge\ndeployment where LDA classifier is proven to be equivalent to FC layer\nincluding in extreme classification scenarios, and (b) optimizations to enable\nXLDA-based training and inference for edge deployment where there is a\nconstraint on available compute resources. We show up to 42x speed up using a\nbatched training approach and up to 5x inference speedup with nearest neighbor\nsearch on extreme datasets like AliProducts (50k classes) and Google Landmarks\nV2 (81k classes)\n","authors":["Karan Shah","Vishruth Veerendranath","Anushka Hebbar","Raghavendra Bhat"],"pdf_url":"https://arxiv.org/pdf/2307.11317v1.pdf","comment":"Submitted at ICML 2023: PAC-Bayes Interactive Learning Workshop"},{"id":"http://arxiv.org/abs/2307.10579v2","updated":"2023-07-21T02:54:25Z","published":"2023-07-20T04:45:59Z","title":"SecureBoost Hyperparameter Tuning via Multi-Objective Federated Learning","summary":" SecureBoost is a tree-boosting algorithm leveraging homomorphic encryption to\nprotect data privacy in vertical federated learning setting. It is widely used\nin fields such as finance and healthcare due to its interpretability,\neffectiveness, and privacy-preserving capability. However, SecureBoost suffers\nfrom high computational complexity and risk of label leakage. To harness the\nfull potential of SecureBoost, hyperparameters of SecureBoost should be\ncarefully chosen to strike an optimal balance between utility, efficiency, and\nprivacy. 
Existing methods either set hyperparameters empirically or\nheuristically, which are far from optimal. To fill this gap, we propose a\nConstrained Multi-Objective SecureBoost (CMOSB) algorithm to find Pareto\noptimal solutions that each solution is a set of hyperparameters achieving\noptimal tradeoff between utility loss, training cost, and privacy leakage. We\ndesign measurements of the three objectives. In particular, the privacy leakage\nis measured using our proposed instance clustering attack. Experimental results\ndemonstrate that the CMOSB yields not only hyperparameters superior to the\nbaseline but also optimal sets of hyperparameters that can support the flexible\nrequirements of FL participants.\n","authors":["Ziyao Ren","Yan Kang","Lixin Fan","Linghua Yang","Tao Fan","Yongxin Tong","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10579v2.pdf","comment":"FL-ICAI'23"},{"id":"http://arxiv.org/abs/2307.11316v1","updated":"2023-07-21T02:51:41Z","published":"2023-07-21T02:51:41Z","title":"Making Pre-trained Language Models both Task-solvers and\n Self-calibrators","summary":" Pre-trained language models (PLMs) serve as backbones for various real-world\nsystems. For high-stake applications, it's equally essential to have reasonable\nconfidence estimations in predictions. While the vanilla confidence scores of\nPLMs can already be effectively utilized, PLMs consistently become\noverconfident in their wrong predictions, which is not desirable in practice.\nPrevious work shows that introducing an extra calibration task can mitigate\nthis issue. The basic idea involves acquiring additional data to train models\nin predicting the confidence of their initial predictions. However, it only\ndemonstrates the feasibility of this kind of method, assuming that there are\nabundant extra available samples for the introduced calibration task. In this\nwork, we consider the practical scenario that we need to effectively utilize\ntraining samples to make PLMs both task-solvers and self-calibrators. Three\nchallenges are presented, including limited training samples, data imbalance,\nand distribution shifts. We first conduct pilot experiments to quantify various\ndecisive factors in the calibration task. Based on the empirical analysis\nresults, we propose a training algorithm LM-TOAST to tackle the challenges.\nExperimental results show that LM-TOAST can effectively utilize the training\ndata to make PLMs have reasonable confidence estimations while maintaining the\noriginal task performance. Further, we consider three downstream applications,\nnamely selective classification, adversarial defense, and model cascading, to\nshow the practical usefulness of LM-TOAST. The code will be made public at\n\\url{https://github.com/Yangyi-Chen/LM-TOAST}.\n","authors":["Yangyi Chen","Xingyao Wang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2307.11316v1.pdf","comment":"Accepted to Findings of ACL 2023"},{"id":"http://arxiv.org/abs/2307.11314v1","updated":"2023-07-21T02:47:03Z","published":"2023-07-21T02:47:03Z","title":"Neuromorphic Online Learning for Spatiotemporal Patterns with a\n Forward-only Timeline","summary":" Spiking neural networks (SNNs) are bio-plausible computing models with high\nenergy efficiency. The temporal dynamics of neurons and synapses enable them to\ndetect temporal patterns and generate sequences. 
While Backpropagation Through\nTime (BPTT) is traditionally used to train SNNs, it is not suitable for online\nlearning of embedded applications due to its high computation and memory cost\nas well as extended latency. Previous works have proposed online learning\nalgorithms, but they often utilize highly simplified spiking neuron models\nwithout synaptic dynamics and reset feedback, resulting in subpar performance.\nIn this work, we present Spatiotemporal Online Learning for Synaptic Adaptation\n(SOLSA), specifically designed for online learning of SNNs composed of Leaky\nIntegrate and Fire (LIF) neurons with exponentially decayed synapses and soft\nreset. The algorithm not only learns the synaptic weight but also adapts the\ntemporal filters associated to the synapses. Compared to the BPTT algorithm,\nSOLSA has much lower memory requirement and achieves a more balanced temporal\nworkload distribution. Moreover, SOLSA incorporates enhancement techniques such\nas scheduled weight update, early stop training and adaptive synapse filter,\nwhich speed up the convergence and enhance the learning performance. When\ncompared to other non-BPTT based SNN learning, SOLSA demonstrates an average\nlearning accuracy improvement of 14.2%. Furthermore, compared to BPTT, SOLSA\nachieves a 5% higher average learning accuracy with a 72% reduction in memory\ncost.\n","authors":["Zhenhang Zhang","Jingang Jin","Haowen Fang","Qinru Qiu"],"pdf_url":"https://arxiv.org/pdf/2307.11314v1.pdf","comment":"9 pages,8 figures"},{"id":"http://arxiv.org/abs/2303.17555v2","updated":"2023-07-21T02:20:39Z","published":"2023-03-16T21:02:09Z","title":"Factoring the Matrix of Domination: A Critical Review and Reimagination\n of Intersectionality in AI Fairness","summary":" Intersectionality is a critical framework that, through inquiry and praxis,\nallows us to examine how social inequalities persist through domains of\nstructure and discipline. Given AI fairness' raison d'etre of \"fairness\", we\nargue that adopting intersectionality as an analytical framework is pivotal to\neffectively operationalizing fairness. Through a critical review of how\nintersectionality is discussed in 30 papers from the AI fairness literature, we\ndeductively and inductively: 1) map how intersectionality tenets operate within\nthe AI fairness paradigm and 2) uncover gaps between the conceptualization and\noperationalization of intersectionality. We find that researchers\noverwhelmingly reduce intersectionality to optimizing for fairness metrics over\ndemographic subgroups. They also fail to discuss their social context and when\nmentioning power, they mostly situate it only within the AI pipeline. We: 3)\noutline and assess the implications of these gaps for critical inquiry and\npraxis, and 4) provide actionable recommendations for AI fairness researchers\nto engage with intersectionality in their work by grounding it in AI\nepistemology.\n","authors":["Anaelia Ovalle","Arjun Subramonian","Vagrant Gautam","Gilbert Gee","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2303.17555v2.pdf","comment":"To appear at AIES 2023"},{"id":"http://arxiv.org/abs/2302.04973v2","updated":"2023-07-21T01:40:31Z","published":"2023-02-09T23:25:28Z","title":"Invariant Slot Attention: Object Discovery with Slot-Centric Reference\n Frames","summary":" Automatically discovering composable abstractions from raw perceptual data is\na long-standing challenge in machine learning. 
Recent slot-based neural\nnetworks that learn about objects in a self-supervised manner have made\nexciting progress in this direction. However, they typically fall short at\nadequately capturing spatial symmetries present in the visual world, which\nleads to sample inefficiency, such as when entangling object appearance and\npose. In this paper, we present a simple yet highly effective method for\nincorporating spatial symmetries via slot-centric reference frames. We\nincorporate equivariance to per-object pose transformations into the attention\nand generation mechanism of Slot Attention by translating, scaling, and\nrotating position encodings. These changes result in little computational\noverhead, are easy to implement, and can result in large gains in terms of data\nefficiency and overall improvements to object discovery. We evaluate our method\non a wide range of synthetic object discovery benchmarks namely CLEVR,\nTetrominoes, CLEVRTex, Objects Room and MultiShapeNet, and show promising\nimprovements on the challenging real-world Waymo Open dataset.\n","authors":["Ondrej Biza","Sjoerd van Steenkiste","Mehdi S. M. Sajjadi","Gamaleldin F. Elsayed","Aravindh Mahendran","Thomas Kipf"],"pdf_url":"https://arxiv.org/pdf/2302.04973v2.pdf","comment":"Accepted at ICML 2023. Project page: https://invariantsa.github.io/"},{"id":"http://arxiv.org/abs/2307.11289v1","updated":"2023-07-21T01:18:02Z","published":"2023-07-21T01:18:02Z","title":"PI-VEGAN: Physics Informed Variational Embedding Generative Adversarial\n Networks for Stochastic Differential Equations","summary":" We present a new category of physics-informed neural networks called physics\ninformed variational embedding generative adversarial network (PI-VEGAN), that\neffectively tackles the forward, inverse, and mixed problems of stochastic\ndifferential equations. In these scenarios, the governing equations are known,\nbut only a limited number of sensor measurements of the system parameters are\navailable. We integrate the governing physical laws into PI-VEGAN with\nautomatic differentiation, while introducing a variational encoder for\napproximating the latent variables of the actual distribution of the\nmeasurements. These latent variables are integrated into the generator to\nfacilitate accurate learning of the characteristics of the stochastic partial\nequations. Our model consists of three components, namely the encoder,\ngenerator, and discriminator, each of which is updated alternatively employing\nthe stochastic gradient descent algorithm. We evaluate the effectiveness of\nPI-VEGAN in addressing forward, inverse, and mixed problems that require the\nconcurrent calculation of system parameters and solutions. Numerical results\ndemonstrate that the proposed method achieves satisfactory stability and\naccuracy in comparison with the previous physics-informed generative\nadversarial network (PI-WGAN).\n","authors":["Ruisong Gao","Yufeng Wang","Min Yang","Chuanjun Chen"],"pdf_url":"https://arxiv.org/pdf/2307.11289v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2307.11288v1","updated":"2023-07-21T01:17:31Z","published":"2023-07-21T01:17:31Z","title":"Kernelized Offline Contextual Dueling Bandits","summary":" Preference-based feedback is important for many applications where direct\nevaluation of a reward function is not feasible. 
A notable recent example\narises in reinforcement learning from human feedback on large language models.\nFor many of these applications, the cost of acquiring the human feedback can be\nsubstantial or even prohibitive. In this work, we take advantage of the fact\nthat often the agent can choose contexts at which to obtain human feedback in\norder to most efficiently identify a good policy, and introduce the offline\ncontextual dueling bandit setting. We give an upper-confidence-bound style\nalgorithm for this setting and prove a regret bound. We also give empirical\nconfirmation that this method outperforms a similar strategy that uses\nuniformly sampled contexts.\n","authors":["Viraj Mehta","Ojash Neopane","Vikramjeet Das","Sen Lin","Jeff Schneider","Willie Neiswanger"],"pdf_url":"https://arxiv.org/pdf/2307.11288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09702v5","updated":"2023-07-21T01:07:19Z","published":"2022-05-19T17:11:45Z","title":"Parallel and Distributed Graph Neural Networks: An In-Depth Concurrency\n Analysis","summary":" Graph neural networks (GNNs) are among the most powerful tools in deep\nlearning. They routinely solve complex problems on unstructured networks, such\nas node classification, graph classification, or link prediction, with high\naccuracy. However, both inference and training of GNNs are complex, and they\nuniquely combine the features of irregular graph processing with dense and\nregular computations. This complexity makes it very challenging to execute GNNs\nefficiently on modern massively parallel architectures. To alleviate this, we\nfirst design a taxonomy of parallelism in GNNs, considering data and model\nparallelism, and different forms of pipelining. Then, we use this taxonomy to\ninvestigate the amount of parallelism in numerous GNN models, GNN-driven\nmachine learning tasks, software frameworks, or hardware accelerators. We use\nthe work-depth model, and we also assess communication volume and\nsynchronization. We specifically focus on the sparsity/density of the\nassociated tensors, in order to understand how to effectively apply techniques\nsuch as vectorization. We also formally analyze GNN pipelining, and we\ngeneralize the established Message-Passing class of GNN models to cover\narbitrary pipeline depths, facilitating future optimizations. Finally, we\ninvestigate different forms of asynchronicity, navigating the path for future\nasynchronous parallel GNN pipelines. The outcomes of our analysis are\nsynthesized in a set of insights that help to maximize GNN performance, and a\ncomprehensive list of challenges and opportunities for further research into\nefficient GNN computations. Our work will help to advance the design of future\nGNNs.\n","authors":["Maciej Besta","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2205.09702v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11285v1","updated":"2023-07-21T01:04:52Z","published":"2023-07-21T01:04:52Z","title":"MAS: Towards Resource-Efficient Federated Multiple-Task Learning","summary":" Federated learning (FL) is an emerging distributed machine learning method\nthat empowers in-situ model training on decentralized edge devices. However,\nmultiple simultaneous FL tasks could overload resource-constrained devices. In\nthis work, we propose the first FL system to effectively coordinate and train\nmultiple simultaneous FL tasks. We first formalize the problem of training\nsimultaneous FL tasks. 
Then, we present our new approach, MAS (Merge and\nSplit), to optimize the performance of training multiple simultaneous FL tasks.\nMAS starts by merging FL tasks into an all-in-one FL task with a multi-task\narchitecture. After training for a few rounds, MAS splits the all-in-one FL\ntask into two or more FL tasks by using the affinities among tasks measured\nduring the all-in-one training. It then continues training each split of FL\ntasks based on model parameters from the all-in-one training. Extensive\nexperiments demonstrate that MAS outperforms other methods while reducing\ntraining time by 2x and reducing energy consumption by 40%. We hope this work\nwill inspire the community to further study and optimize training simultaneous\nFL tasks.\n","authors":["Weiming Zhuang","Yonggang Wen","Lingjuan Lyu","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11285v1.pdf","comment":"ICCV'23. arXiv admin note: substantial text overlap with\n arXiv:2207.04202"},{"id":"http://arxiv.org/abs/2307.11280v1","updated":"2023-07-21T00:49:07Z","published":"2023-07-21T00:49:07Z","title":"Epsilon*: Privacy Metric for Machine Learning Models","summary":" We introduce Epsilon*, a new privacy metric for measuring the privacy risk of\na single model instance prior to, during, or after deployment of privacy\nmitigation strategies. The metric does not require access to the training data\nsampling or model training algorithm. Epsilon* is a function of true positive\nand false positive rates in a hypothesis test used by an adversary in a\nmembership inference attack. We distinguish between quantifying the privacy\nloss of a trained model instance and quantifying the privacy loss of the\ntraining mechanism which produces this model instance. Existing approaches in\nthe privacy auditing literature provide lower bounds for the latter, while our\nmetric provides a lower bound for the former by relying on an\n(${\\epsilon}$,${\\delta}$)-type of quantification of the privacy of the trained\nmodel instance. We establish a relationship between these lower bounds and show\nhow to implement Epsilon* to avoid numerical and noise amplification\ninstability. We further show in experiments on benchmark public data sets that\nEpsilon* is sensitive to privacy risk mitigation by training with differential\nprivacy (DP), where the value of Epsilon* is reduced by up to 800% compared to\nthe Epsilon* values of non-DP trained baseline models. This metric allows\nprivacy auditors to be independent of model owners, and enables all\ndecision-makers to visualize the privacy-utility landscape to make informed\ndecisions regarding the trade-offs between model privacy and utility.\n","authors":["Diana M. Negoescu","Humberto Gonzalez","Saad Eddin Al Orjany","Jilei Yang","Yuliia Lut","Rahul Tandra","Xiaowen Zhang","Xinyi Zheng","Zach Douglas","Vidita Nolkha","Parvez Ahammad","Gennady Samorodnitsky"],"pdf_url":"https://arxiv.org/pdf/2307.11280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11274v1","updated":"2023-07-21T00:15:56Z","published":"2023-07-21T00:15:56Z","title":"Screening Mammography Breast Cancer Detection","summary":" Breast cancer is a leading cause of cancer-related deaths, but current\nprograms are expensive and prone to false positives, leading to unnecessary\nfollow-up and patient anxiety. This paper proposes a solution to automated\nbreast cancer detection, to improve the efficiency and accuracy of screening\nprograms. 
Different methodologies were tested against the RSNA dataset of\nradiographic breast images of roughly 20,000 female patients and yielded an\naverage validation case pF1 score of 0.56 across methods.\n","authors":["Debajyoti Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2307.11274v1.pdf","comment":"Released @ Apr 2023. For associated project files, see\n https://github.com/chakrabortyde/rsna-breast-cancer"},{"id":"http://arxiv.org/abs/2305.13503v2","updated":"2023-07-21T00:15:28Z","published":"2023-05-22T21:39:38Z","title":"Asynchronous Multi-Model Dynamic Federated Learning over Wireless\n Networks: Theory, Modeling, and Optimization","summary":" Federated learning (FL) has emerged as a key technique for distributed\nmachine learning (ML). Most literature on FL has focused on ML model training\nfor (i) a single task/model, with (ii) a synchronous scheme for uplink/downlink\ntransfer of model parameters, and (iii) a static data distribution setting\nacross devices. These assumptions are often not well representative of\nconditions encountered in practical FL environments. To address this, we\ndevelop DMA-FL, which considers dynamic FL with multiple downstream tasks to be\ntrained over an asynchronous model transmission architecture. We first\ncharacterize the convergence of ML model training under DMA-FL via introducing\na family of scheduling tensors and rectangular functions to capture the\nscheduling of devices. Our convergence analysis sheds light on the impact of\nresource allocation, device scheduling, and individual model states on the\nperformance of ML models. We then formulate a non-convex mixed integer\noptimization problem for jointly configuring the resource allocation and device\nscheduling to strike an efficient trade-off between energy consumption and ML\nperformance. We develop a solution methodology employing successive convex\napproximations with convergence guarantee to a stationary point. Through\nnumerical simulations, we reveal the advantages of DMA-FL in terms of model\nperformance and network resource savings.\n","authors":["Zhan-Lun Chang","Seyyedali Hosseinalipour","Mung Chiang","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2305.13503v2.pdf","comment":"Submission to IEEE Transactions on Cognitive Communications and\n Networking"}],"Multimedia":[{"id":"http://arxiv.org/abs/2304.14133v2","updated":"2023-07-21T12:06:17Z","published":"2023-04-27T12:28:29Z","title":"VERITE: A Robust Benchmark for Multimodal Misinformation Detection\n Accounting for Unimodal Bias","summary":" Multimedia content has become ubiquitous on social media platforms, leading\nto the rise of multimodal misinformation (MM) and the urgent need for effective\nstrategies to detect and prevent its spread. In recent years, the challenge of\nmultimodal misinformation detection (MMD) has garnered significant attention by\nresearchers and has mainly involved the creation of annotated, weakly\nannotated, or synthetically generated training datasets, along with the\ndevelopment of various deep learning MMD models. However, the problem of\nunimodal bias in MMD benchmarks -- where biased or unimodal methods outperform\ntheir multimodal counterparts on an inherently multimodal task -- has been\noverlooked. In this study, we systematically investigate and identify the\npresence of unimodal bias in widely-used MMD benchmarks (VMU-Twitter, COSMOS),\nraising concerns about their suitability for reliable evaluation. 
To address\nthis issue, we introduce the \"VERification of Image-TExtpairs\" (VERITE)\nbenchmark for MMD which incorporates real-world data, excludes \"asymmetric\nmultimodal misinformation\" and utilizes \"modality balancing\". We conduct an\nextensive comparative study with a Transformer-based architecture that shows\nthe ability of VERITE to effectively address unimodal bias, rendering it a\nrobust evaluation framework for MMD. Furthermore, we introduce a new method --\ntermed Crossmodal HArd Synthetic MisAlignment (CHASMA) -- for generating\nrealistic synthetic training data that preserve crossmodal relations between\nlegitimate images and false human-written captions. By leveraging CHASMA in the\ntraining process, we observe consistent and notable improvements in predictive\nperformance on VERITE; with a 9.2% increase in accuracy. We release our code\nat: https://github.com/stevejpapad/image-text-verification\n","authors":["Stefanos-Iordanis Papadopoulos","Christos Koutlis","Symeon Papadopoulos","Panagiotis C. Petrantonakis"],"pdf_url":"https://arxiv.org/pdf/2304.14133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09382v3","updated":"2023-07-21T07:59:06Z","published":"2023-06-15T12:59:04Z","title":"Sound Demixing Challenge 2023 Music Demixing Track Technical Report:\n TFC-TDF-UNet v3","summary":" In this report, we present our award-winning solutions for the Music Demixing\nTrack of Sound Demixing Challenge 2023. First, we propose TFC-TDF-UNet v3, a\ntime-efficient music source separation model that achieves state-of-the-art\nresults on the MUSDB benchmark. We then give full details regarding our\nsolutions for each Leaderboard, including a loss masking approach for\nnoise-robust training. Code for reproducing model training and final\nsubmissions is available at github.com/kuielab/sdx23.\n","authors":["Minseok Kim","Jun Hyung Lee","Soonyoung Jung"],"pdf_url":"https://arxiv.org/pdf/2306.09382v3.pdf","comment":"5 pages, 4 tables"},{"id":"http://arxiv.org/abs/2301.12688v3","updated":"2023-07-21T18:13:10Z","published":"2023-01-30T06:37:35Z","title":"Dynamic Storyboard Generation in an Engine-based Virtual Environment for\n Video Production","summary":" Amateurs working on mini-films and short-form videos usually spend lots of\ntime and effort on the multi-round complicated process of setting and adjusting\nscenes, plots, and cameras to deliver satisfying video shots. We present\nVirtual Dynamic Storyboard (VDS) to allow users storyboarding shots in virtual\nenvironments, where the filming staff can easily test the settings of shots\nbefore the actual filming. VDS runs on a \"propose-simulate-discriminate\" mode:\nGiven a formatted story script and a camera script as input, it generates\nseveral character animation and camera movement proposals following predefined\nstory and cinematic rules to allow an off-the-shelf simulation engine to render\nvideos. To pick up the top-quality dynamic storyboard from the candidates, we\nequip it with a shot ranking discriminator based on shot quality criteria\nlearned from professional manual-created data. 
VDS is comprehensively validated\nvia extensive experiments and user studies, demonstrating its efficiency,\neffectiveness, and great potential in assisting amateur video production.\n","authors":["Anyi Rao","Xuekun Jiang","Yuwei Guo","Linning Xu","Lei Yang","Libiao Jin","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2301.12688v3.pdf","comment":"Project page: https://virtualfilmstudio.github.io/"}]},"2023-07-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi- view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: : https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2307.12976v1","updated":"2023-07-24T17:52:46Z","published":"2023-07-24T17:52:46Z","title":"Evaluating the Ripple Effects of Knowledge Editing in Language Models","summary":" Modern language models capture a large body of factual knowledge. However,\nsome facts can be incorrectly induced or become obsolete over time, resulting\nin factually incorrect generations. This has led to the development of various\nediting methods that allow updating facts encoded by the model. Evaluation of\nthese methods has primarily focused on testing whether an individual fact has\nbeen successfully injected, and if similar predictions for other subjects have\nnot changed. Here we argue that such evaluation is limited, since injecting one\nfact (e.g. ``Jack Depp is the son of Johnny Depp'') introduces a ``ripple\neffect'' in the form of additional facts that the model needs to update\n(e.g.``Jack Depp is the sibling of Lily-Rose Depp''). 
To address this issue, we\npropose a novel set of evaluation criteria that consider the implications of an\nedit on related facts. Using these criteria, we then construct \\ripple{}, a\ndiagnostic benchmark of 5K factual edits, capturing a variety of types of\nripple effects. We evaluate prominent editing methods on \\ripple{}, showing\nthat current methods fail to introduce consistent changes in the model's\nknowledge. In addition, we find that a simple in-context editing baseline\nobtains the best scores on our benchmark, suggesting a promising research\ndirection for model editing.\n","authors":["Roi Cohen","Eden Biran","Ori Yoran","Amir Globerson","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2307.12976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12973v1","updated":"2023-07-24T17:49:31Z","published":"2023-07-24T17:49:31Z","title":"Leveraging Label Variation in Large Language Models for Zero-Shot Text\n Classification","summary":" The zero-shot learning capabilities of large language models (LLMs) make them\nideal for text classification without annotation or supervised training. Many\nstudies have shown impressive results across multiple tasks. While tasks, data,\nand results differ widely, their similarities to human annotation can aid us in\ntackling new tasks with minimal expenses. We evaluate using 5 state-of-the-art\nLLMs as \"annotators\" on 5 different tasks (age, gender, topic, sentiment\nprediction, and hate speech detection), across 4 languages: English, French,\nGerman, and Spanish. No single model excels at all tasks, across languages, or\nacross all labels within a task. However, aggregation techniques designed for\nhuman annotators perform substantially better than any one individual model.\nOverall, though, LLMs do not rival even simple supervised models, so they do\nnot (yet) replace the need for human annotation. We also discuss the tradeoffs\nbetween speed, accuracy, cost, and bias when it comes to aggregated model\nlabeling versus human annotation.\n","authors":["Flor Miriam Plaza-del-Arco","Debora Nozza","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2307.12973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12966v1","updated":"2023-07-24T17:44:58Z","published":"2023-07-24T17:44:58Z","title":"Aligning Large Language Models with Human: A Survey","summary":" Large Language Models (LLMs) trained on extensive textual corpora have\nemerged as leading solutions for a broad array of Natural Language Processing\n(NLP) tasks. Despite their notable performance, these models are prone to\ncertain limitations such as misunderstanding human instructions, generating\npotentially biased content, or factually incorrect (hallucinated) information.\nHence, aligning LLMs with human expectations has become an active area of\ninterest within the research community. This survey presents a comprehensive\noverview of these alignment technologies, including the following aspects. (1)\nData collection: the methods for effectively collecting high-quality\ninstructions for LLM alignment, including the use of NLP benchmarks, human\nannotations, and leveraging strong LLMs. (2) Training methodologies: a detailed\nreview of the prevailing training methods employed for LLM alignment. Our\nexploration encompasses Supervised Fine-tuning, both Online and Offline human\npreference training, along with parameter-efficient training mechanisms. 
(3)\nModel Evaluation: the methods for evaluating the effectiveness of these\nhuman-aligned LLMs, presenting a multifaceted approach towards their\nassessment. In conclusion, we collate and distill our findings, shedding light\non several promising future research avenues in the field. This survey,\ntherefore, serves as a valuable resource for anyone invested in understanding\nand advancing the alignment of LLMs to better suit human-oriented tasks and\nexpectations. An associated GitHub link collecting the latest papers is\navailable at https://github.com/GaryYufei/AlignLLMHumanSurvey.\n","authors":["Yufei Wang","Wanjun Zhong","Liangyou Li","Fei Mi","Xingshan Zeng","Wenyong Huang","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12966v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2303.04245v2","updated":"2023-07-24T17:29:04Z","published":"2023-03-07T21:42:17Z","title":"How Do Transformers Learn Topic Structure: Towards a Mechanistic\n Understanding","summary":" While the successes of transformers across many domains are indisputable,\naccurate understanding of the learning mechanics is still largely lacking.\nTheir capabilities have been probed on benchmarks which include a variety of\nstructured and reasoning tasks -- but mathematical understanding is lagging\nsubstantially behind. Recent lines of work have begun studying representational\naspects of this question: that is, the size/depth/complexity of attention-based\nnetworks to perform certain tasks. However, there is no guarantee the learning\ndynamics will converge to the constructions proposed. In our paper, we provide\nfine-grained mechanistic understanding of how transformers learn \"semantic\nstructure\", understood as capturing co-occurrence structure of words.\nPrecisely, we show, through a combination of mathematical analysis and\nexperiments on Wikipedia data and synthetic data modeled by Latent Dirichlet\nAllocation (LDA), that the embedding layer and the self-attention layer encode\nthe topical structure. In the former case, this manifests as higher average\ninner product of embeddings between same-topic words. In the latter, it\nmanifests as higher average pairwise attention between same-topic words. The\nmathematical results involve several assumptions to make the analysis\ntractable, which we verify on data, and might be of independent interest as\nwell.\n","authors":["Yuchen Li","Yuanzhi Li","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2303.04245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12950v1","updated":"2023-07-24T17:23:22Z","published":"2023-07-24T17:23:22Z","title":"RLCD: Reinforcement Learning from Contrast Distillation for Language\n Model Alignment","summary":" We propose Reinforcement Learning from Contrast Distillation (RLCD), a method\nfor aligning language models to follow natural language principles without\nusing human feedback. RLCD trains a preference model using simulated preference\npairs that contain both a high-quality and low-quality example, generated using\ncontrasting positive and negative prompts. 
The preference model is then used to\nimprove a base unaligned language model via reinforcement learning.\nEmpirically, RLCD outperforms RLAIF (Bai et al., 2022b) and context\ndistillation (Huang et al., 2022) baselines across three diverse alignment\ntasks--harmlessness, helpfulness, and story outline generation--and on both 7B\nand 30B model scales for preference data simulation.\n","authors":["Kevin Yang","Dan Klein","Asli Celikyilmaz","Nanyun Peng","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2307.12950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12949v1","updated":"2023-07-24T17:22:04Z","published":"2023-07-24T17:22:04Z","title":"Boosting Punctuation Restoration with Data Generation and Reinforcement\n Learning","summary":" Punctuation restoration is an important task in automatic speech recognition\n(ASR) which aim to restore the syntactic structure of generated ASR texts to\nimprove readability. While punctuated texts are abundant from written\ndocuments, the discrepancy between written punctuated texts and ASR texts\nlimits the usability of written texts in training punctuation restoration\nsystems for ASR texts. This paper proposes a reinforcement learning method to\nexploit in-topic written texts and recent advances in large pre-trained\ngenerative language models to bridge this gap. The experiments show that our\nmethod achieves state-of-the-art performance on the ASR test set on two\nbenchmark datasets for punctuation restoration.\n","authors":["Viet Dac Lai","Abel Salinas","Hao Tan","Trung Bui","Quan Tran","Seunghyun Yoon","Hanieh Deilamsalehy","Franck Dernoncourt","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.12949v1.pdf","comment":"Accepted at INTERSPEECH 2023, 6 pages"},{"id":"http://arxiv.org/abs/2307.12935v1","updated":"2023-07-24T16:55:37Z","published":"2023-07-24T16:55:37Z","title":"Rule By Example: Harnessing Logical Rules for Explainable Hate Speech\n Detection","summary":" Classic approaches to content moderation typically apply a rule-based\nheuristic approach to flag content. While rules are easily customizable and\nintuitive for humans to interpret, they are inherently fragile and lack the\nflexibility or robustness needed to moderate the vast amount of undesirable\ncontent found online today. Recent advances in deep learning have demonstrated\nthe promise of using highly effective deep neural models to overcome these\nchallenges. However, despite the improved performance, these data-driven models\nlack transparency and explainability, often leading to mistrust from everyday\nusers and a lack of adoption by many platforms. In this paper, we present Rule\nBy Example (RBE): a novel exemplar-based contrastive learning approach for\nlearning from logical rules for the task of textual content moderation. RBE is\ncapable of providing rule-grounded predictions, allowing for more explainable\nand customizable predictions compared to typical deep learning-based\napproaches. We demonstrate that our approach is capable of learning rich rule\nembedding representations using only a few data examples. 
Experimental results\non 3 popular hate speech classification datasets show that RBE is able to\noutperform state-of-the-art deep learning classifiers as well as the use of\nrules in both supervised and unsupervised settings while providing explainable\nmodel predictions via rule-grounding.\n","authors":["Christopher Clarke","Matthew Hall","Gaurav Mittal","Ye Yu","Sandra Sajeev","Jason Mars","Mei Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12935v1.pdf","comment":"ACL 2023 Main Conference"},{"id":"http://arxiv.org/abs/2307.12896v1","updated":"2023-07-24T15:44:23Z","published":"2023-07-24T15:44:23Z","title":"Corrections of Zipf's and Heaps' Laws Derived from Hapax Rate Models","summary":" The article introduces corrections to Zipf's and Heaps' laws based on\nsystematic models of the hapax rate. The derivation rests on two assumptions:\nThe first one is the standard urn model which predicts that marginal frequency\ndistributions for shorter texts look as if word tokens were sampled blindly\nfrom a given longer text. The second assumption posits that the rate of hapaxes\nis a simple function of the text size. Four such functions are discussed: the\nconstant model, the Davis model, the linear model, and the logistic model. It\nis shown that the logistic model yields the best fit.\n","authors":["Łukasz Dębowski"],"pdf_url":"https://arxiv.org/pdf/2307.12896v1.pdf","comment":"41 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2304.08649v3","updated":"2023-07-24T15:33:25Z","published":"2023-04-17T22:53:54Z","title":"Classification of US Supreme Court Cases using BERT-Based Techniques","summary":" Models based on bidirectional encoder representations from transformers\n(BERT) produce state of the art (SOTA) results on many natural language\nprocessing (NLP) tasks such as named entity recognition (NER), part-of-speech\n(POS) tagging etc. An interesting phenomenon occurs when classifying long\ndocuments such as those from the US supreme court where BERT-based models can\nbe considered difficult to use on a first-pass or out-of-the-box basis. In this\npaper, we experiment with several BERT-based classification techniques for US\nsupreme court decisions or supreme court database (SCDB) and compare them with\nthe previous SOTA results. We then compare our results specifically with SOTA\nmodels for long documents. We compare our results for two classification tasks:\n(1) a broad classification task with 15 categories and (2) a fine-grained\nclassification task with 279 categories. Our best result produces an accuracy\nof 80\\% on the 15 broad categories and 60\\% on the fine-grained 279 categories\nwhich marks an improvement of 8\\% and 28\\% respectively from previously\nreported SOTA results.\n","authors":["Shubham Vatsal","Adam Meyers","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2304.08649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10490v3","updated":"2023-07-24T15:24:17Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. 
When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12856v1","updated":"2023-07-24T14:56:30Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web navigation. However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that can complete the tasks on real\nwebsites following natural language instructions. WebAgent plans ahead by\ndecomposing instructions into canonical sub-instructions, summarizes long HTML\ndocuments into task-relevant snippets, and acts on websites via generated\nPython programs from those. We design WebAgent with Flan-U-PaLM, for grounded\ncode generation, and HTML-T5, new pre-trained LLMs for long HTML documents\nusing local and global attention mechanisms and a mixture of long-span\ndenoising objectives, for planning and summarization. We empirically\ndemonstrate that our recipe improves the success on a real website by over 50%,\nand that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9%\nhigher success rate than prior SoTA on the MiniWoB web navigation benchmark and\nbetter accuracy on offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12835v1","updated":"2023-07-24T14:33:49Z","published":"2023-07-24T14:33:49Z","title":"Joint Dropout: Improving Generalizability in Low-Resource Neural Machine\n Translation through Phrase Pair Variables","summary":" Despite the tremendous success of Neural Machine Translation (NMT), its\nperformance on low-resource language pairs still remains subpar, partly due to\nthe limited ability to handle previously unseen inputs, i.e., generalization.\nIn this paper, we propose a method called Joint Dropout, that addresses the\nchallenge of low-resource neural machine translation by substituting phrases\nwith variables, resulting in significant enhancement of compositionality, which\nis a key aspect of generalization. We observe a substantial improvement in\ntranslation quality for language pairs with minimal resources, as seen in BLEU\nand Direct Assessment scores. 
Furthermore, we conduct an error analysis, and\nfind Joint Dropout to also enhance generalizability of low-resource NMT in\nterms of robustness and adaptability across different domains\n","authors":["Ali Araabi","Vlad Niculae","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2307.12835v1.pdf","comment":"Accepted at MT Summit 2023"},{"id":"http://arxiv.org/abs/2307.12803v1","updated":"2023-07-24T13:54:37Z","published":"2023-07-24T13:54:37Z","title":"Guidance in Radiology Report Summarization: An Empirical Evaluation and\n Error Analysis","summary":" Automatically summarizing radiology reports into a concise impression can\nreduce the manual burden of clinicians and improve the consistency of\nreporting. Previous work aimed to enhance content selection and factuality\nthrough guided abstractive summarization. However, two key issues persist.\nFirst, current methods heavily rely on domain-specific resources to extract the\nguidance signal, limiting their transferability to domains and languages where\nthose resources are unavailable. Second, while automatic metrics like ROUGE\nshow progress, we lack a good understanding of the errors and failure modes in\nthis task. To bridge these gaps, we first propose a domain-agnostic guidance\nsignal in form of variable-length extractive summaries. Our empirical results\non two English benchmarks demonstrate that this guidance signal improves upon\nunguided summarization while being competitive with domain-specific methods.\nAdditionally, we run an expert evaluation of four systems according to a\ntaxonomy of 11 fine-grained errors. We find that the most pressing differences\nbetween automatic summaries and those of radiologists relate to content\nselection including omissions (up to 52%) and additions (up to 57%). We\nhypothesize that latent reporting factors and corpus-level inconsistencies may\nlimit models to reliably learn content selection from the available data,\npresenting promising directions for future work.\n","authors":["Jan Trienes","Paul Youssef","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2307.12803v1.pdf","comment":"Accepted at INLG2023"},{"id":"http://arxiv.org/abs/2307.12798v1","updated":"2023-07-24T13:51:19Z","published":"2023-07-24T13:51:19Z","title":"RRAML: Reinforced Retrieval Augmented Machine Learning","summary":" The emergence of large language models (LLMs) has revolutionized machine\nlearning and related fields, showcasing remarkable abilities in comprehending,\ngenerating, and manipulating human language. However, their conventional usage\nthrough API-based text prompt submissions imposes certain limitations in terms\nof context constraints and external source availability. To address these\nchallenges, we propose a novel framework called Reinforced Retrieval Augmented\nMachine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs\nwith supporting information retrieved by a purpose-built retriever from a vast\nuser-provided database. By leveraging recent advancements in reinforcement\nlearning, our method effectively addresses several critical challenges.\nFirstly, it circumvents the need for accessing LLM gradients. Secondly, our\nmethod alleviates the burden of retraining LLMs for specific tasks, as it is\noften impractical or impossible due to restricted access to the model and the\ncomputational intensity involved. 
Additionally we seamlessly link the\nretriever's task with the reasoner, mitigating hallucinations and reducing\nirrelevant, and potentially damaging retrieved documents. We believe that the\nresearch agenda outlined in this paper has the potential to profoundly impact\nthe field of AI, democratizing access to and utilization of LLMs for a wide\nrange of entities.\n","authors":["Andrea Bacciu","Florin Cocunasu","Federico Siciliano","Fabrizio Silvestri","Nicola Tonellotto","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2307.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.12662v4","updated":"2023-07-24T13:22:58Z","published":"2020-11-25T11:44:12Z","title":"XTQA: Span-Level Explanations of the Textbook Question Answering","summary":" Textbook Question Answering (TQA) is a task that one should answer a\ndiagram/non-diagram question given a large multi-modal context consisting of\nabundant essays and diagrams. We argue that the explainability of this task\nshould place students as a key aspect to be considered. To address this issue,\nwe devise a novel architecture towards span-level eXplanations of the TQA\n(XTQA) based on our proposed coarse-to-fine grained algorithm, which can\nprovide not only the answers but also the span-level evidences to choose them\nfor students. This algorithm first coarsely chooses top $M$ paragraphs relevant\nto questions using the TF-IDF method, and then chooses top $K$ evidence spans\nfinely from all candidate spans within these paragraphs by computing the\ninformation gain of each span to questions. Experimental results shows that\nXTQA significantly improves the state-of-the-art performance compared with\nbaselines. The source code is available at\nhttps://github.com/keep-smile-001/opentqa\n","authors":["Jie Ma","Qi Chai","Jun Liu","Qingyu Yin","Pinghui Wang","Qinghua Zheng"],"pdf_url":"https://arxiv.org/pdf/2011.12662v4.pdf","comment":"Accepted by IEEE TNNLS"},{"id":"http://arxiv.org/abs/2307.12759v1","updated":"2023-07-24T13:04:21Z","published":"2023-07-24T13:04:21Z","title":"Code-Switched Urdu ASR for Noisy Telephonic Environment using Data\n Centric Approach with Hybrid HMM and CNN-TDNN","summary":" Call Centers have huge amount of audio data which can be used for achieving\nvaluable business insights and transcription of phone calls is manually tedious\ntask. An effective Automated Speech Recognition system can accurately\ntranscribe these calls for easy search through call history for specific\ncontext and content allowing automatic call monitoring, improving QoS through\nkeyword search and sentiment analysis. ASR for Call Center requires more\nrobustness as telephonic environment are generally noisy. Moreover, there are\nmany low-resourced languages that are on verge of extinction which can be\npreserved with help of Automatic Speech Recognition Technology. Urdu is the\n$10^{th}$ most widely spoken language in the world, with 231,295,440 worldwide\nstill remains a resource constrained language in ASR. Regional call-center\nconversations operate in local language, with a mix of English numbers and\ntechnical terms generally causing a \"code-switching\" problem. Hence, this paper\ndescribes an implementation framework of a resource efficient Automatic Speech\nRecognition/ Speech to Text System in a noisy call-center environment using\nChain Hybrid HMM and CNN-TDNN for Code-Switched Urdu Language. Using Hybrid\nHMM-DNN approach allowed us to utilize the advantages of Neural Network with\nless labelled data. 
Adding CNN with TDNN has shown to work better in noisy\nenvironment due to CNN's additional frequency dimension which captures extra\ninformation from noisy speech, thus improving accuracy. We collected data from\nvarious open sources and labelled some of the unlabelled data after analysing\nits general context and content from Urdu language as well as from commonly\nused words from other languages, primarily English and were able to achieve WER\nof 5.2% with noisy as well as clean environment in isolated words or numbers as\nwell as in continuous spontaneous speech.\n","authors":["Muhammad Danyal Khan","Raheem Ali","Arshad Aziz"],"pdf_url":"https://arxiv.org/pdf/2307.12759v1.pdf","comment":"32 pages, 19 figures, 2 tables, preprint"},{"id":"http://arxiv.org/abs/2305.16731v3","updated":"2023-07-24T11:20:10Z","published":"2023-05-26T08:33:28Z","title":"Automatic Emotion Experiencer Recognition","summary":" The most prominent subtask in emotion analysis is emotion classification; to\nassign a category to a textual unit, for instance a social media post. Many\nresearch questions from the social sciences do, however, not only require the\ndetection of the emotion of an author of a post but to understand who is\nascribed an emotion in text. This task is tackled by emotion role labeling\nwhich aims at extracting who is described in text to experience an emotion,\nwhy, and towards whom. This could, however, be considered overly sophisticated\nif the main question to answer is who feels which emotion. A targeted approach\nfor such setup is to classify emotion experiencer mentions (aka \"emoters\")\nregarding the emotion they presumably perceive. This task is similar to named\nentity recognition of person names with the difference that not every mentioned\nentity name is an emoter. While, very recently, data with emoter annotations\nhas been made available, no experiments have yet been performed to detect such\nmentions. With this paper, we provide baseline experiments to understand how\nchallenging the task is. We further evaluate the impact on experiencer-specific\nemotion categorization and appraisal detection in a pipeline, when gold\nmentions are not available. We show that experiencer detection in text is a\nchallenging task, with a precision of .82 and a recall of .56 (F1 =.66). These\nresults motivate future work of jointly modeling emoter spans and\nemotion/appraisal predictions.\n","authors":["Maximilian Wegge","Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2305.16731v3.pdf","comment":"accepted to the CPSS workshop at KONVENS"},{"id":"http://arxiv.org/abs/2307.12659v1","updated":"2023-07-24T10:03:28Z","published":"2023-07-24T10:03:28Z","title":"A Model for Every User and Budget: Label-Free and Personalized\n Mixed-Precision Quantization","summary":" Recent advancement in Automatic Speech Recognition (ASR) has produced large\nAI models, which become impractical for deployment in mobile devices. Model\nquantization is effective to produce compressed general-purpose models, however\nsuch models may only be deployed to a restricted sub-domain of interest. We\nshow that ASR models can be personalized during quantization while relying on\njust a small set of unlabelled samples from the target domain. To this end, we\npropose myQASR, a mixed-precision quantization method that generates tailored\nquantization schemes for diverse users under any memory requirement with no\nfine-tuning. 
myQASR automatically evaluates the quantization sensitivity of\nnetwork layers by analysing the full-precision activation values. We are then\nable to generate a personalised mixed-precision quantization scheme for any\npre-determined memory budget. Results for large-scale ASR models show how\nmyQASR improves performance for specific genders, languages, and speakers.\n","authors":["Edward Fish","Umberto Michieli","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.12659v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2301.09790v3","updated":"2023-07-24T10:03:01Z","published":"2023-01-24T02:44:02Z","title":"The Next Chapter: A Study of Large Language Models in Storytelling","summary":" To enhance the quality of generated stories, recent story generation models\nhave been investigating the utilization of higher-level attributes like plots\nor commonsense knowledge. The application of prompt-based learning with large\nlanguage models (LLMs), exemplified by GPT-3, has exhibited remarkable\nperformance in diverse natural language processing (NLP) tasks. This paper\nconducts a comprehensive investigation, utilizing both automatic and human\nevaluation, to compare the story generation capacity of LLMs with recent models\nacross three datasets with variations in style, register, and length of\nstories. The results demonstrate that LLMs generate stories of significantly\nhigher quality compared to other story generation models. Moreover, they\nexhibit a level of performance that competes with human authors, albeit with\nthe preliminary observation that they tend to replicate real stories in\nsituations involving world knowledge, resembling a form of plagiarism.\n","authors":["Zhuohan Xie","Trevor Cohn","Jey Han Lau"],"pdf_url":"https://arxiv.org/pdf/2301.09790v3.pdf","comment":"Accepted to INLG2023"},{"id":"http://arxiv.org/abs/2304.14721v4","updated":"2023-07-24T09:49:55Z","published":"2023-04-28T09:42:18Z","title":"Towards autonomous system: flexible modular production system enhanced\n with large language model agents","summary":" In this paper, we present a novel framework that combines large language\nmodels (LLMs), digital twins and industrial automation system to enable\nintelligent planning and control of production processes. We retrofit the\nautomation system for a modular production facility and create executable\ncontrol interfaces of fine-granular functionalities and coarse-granular skills.\nLow-level functionalities are executed by automation components, and high-level\nskills are performed by automation modules. Subsequently, a digital twin system\nis developed, registering these interfaces and containing additional\ndescriptive information about the production system. Based on the retrofitted\nautomation system and the created digital twins, LLM-agents are designed to\ninterpret descriptive information in the digital twins and control the physical\nsystem through service interfaces. These LLM-agents serve as intelligent agents\non different levels within an automation system, enabling autonomous planning\nand control of flexible production. Given a task instruction as input, the\nLLM-agents orchestrate a sequence of atomic functionalities and skills to\naccomplish the task. 
We demonstrate how our implemented prototype can handle\nun-predefined tasks, plan a production process, and execute the operations.\nThis research highlights the potential of integrating LLMs into industrial\nautomation systems in the context of smart factory for more agile, flexible,\nand adaptive production processes, while it also underscores the critical\ninsights and limitations for future work. Demos at:\nhttps://github.com/YuchenXia/GPT4IndustrialAutomation\n","authors":["Yuchen Xia","Manthan Shenoy","Nasser Jazdi","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2304.14721v4.pdf","comment":"This is the pre-print draft manuscript. The peer-reviewed version\n will be published exclusively by IEEE after the conference, which is set to\n take place from September 12th to 15th, 2023. We've made several improvements\n to the final version of the paper based on valuable feedback and suggestions\n from other researchers"},{"id":"http://arxiv.org/abs/2307.12639v1","updated":"2023-07-24T09:30:30Z","published":"2023-07-24T09:30:30Z","title":"Fake News Detection Through Graph-based Neural Networks: A Survey","summary":" The popularity of online social networks has enabled rapid dissemination of\ninformation. People now can share and consume information much more rapidly\nthan ever before. However, low-quality and/or accidentally/deliberately fake\ninformation can also spread rapidly. This can lead to considerable and negative\nimpacts on society. Identifying, labelling and debunking online misinformation\nas early as possible has become an increasingly urgent problem. Many methods\nhave been proposed to detect fake news including many deep learning and\ngraph-based approaches. In recent years, graph-based methods have yielded\nstrong results, as they can closely model the social context and propagation\nprocess of online news. In this paper, we present a systematic review of fake\nnews detection studies based on graph-based and deep learning-based techniques.\nWe classify existing graph-based methods into knowledge-driven methods,\npropagation-based methods, and heterogeneous social context-based methods,\ndepending on how a graph structure is constructed to model news related\ninformation flows. We further discuss the challenges and open problems in\ngraph-based fake news detection and identify future research directions.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2307.12639v1.pdf","comment":"18 pages, 3 tables, 7 figures"},{"id":"http://arxiv.org/abs/2210.04676v2","updated":"2023-07-24T09:00:03Z","published":"2022-10-10T13:26:45Z","title":"Learning \"O\" Helps for Learning More: Handling the Concealed Entity\n Problem for Class-incremental NER","summary":" As the categories of named entities rapidly increase, the deployed NER models\nare required to keep updating toward recognizing more entity types, creating a\ndemand for class-incremental learning for NER. Considering the privacy concerns\nand storage constraints, the standard paradigm for class-incremental NER\nupdates the models with training data only annotated with the new classes, yet\nthe entities from other entity classes are unlabeled, regarded as \"Non-entity\"\n(or \"O\"). In this work, we conduct an empirical study on the \"Unlabeled Entity\nProblem\" and find that it leads to severe confusion between \"O\" and entities,\ndecreasing class discrimination of old classes and declining the model's\nability to learn new classes. 
To solve the Unlabeled Entity Problem, we propose\na novel representation learning method to learn discriminative representations\nfor the entity classes and \"O\". Specifically, we propose an entity-aware\ncontrastive learning method that adaptively detects entity clusters in \"O\".\nFurthermore, we propose two effective distance-based relabeling strategies for\nbetter learning the old classes. We introduce a more realistic and challenging\nbenchmark for class-incremental NER, and the proposed method achieves up to\n10.62\\% improvement over the baseline methods.\n","authors":["Ruotian Ma","Xuanting Chen","Lin Zhang","Xin Zhou","Junzhe Wang","Tao Gui","Qi Zhang","Xiang Gao","Yunwen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.04676v2.pdf","comment":"Accepted by ACL 2023"},{"id":"http://arxiv.org/abs/2306.16108v2","updated":"2023-07-24T08:14:44Z","published":"2023-06-28T11:24:48Z","title":"Is ChatGPT a Biomedical Expert? -- Exploring the Zero-Shot Performance\n of Current GPT Models in Biomedical Tasks","summary":" We assessed the performance of commercial Large Language Models (LLMs)\nGPT-3.5-Turbo and GPT-4 on tasks from the 2023 BioASQ challenge. In Task 11b\nPhase B, which is focused on answer generation, both models demonstrated\ncompetitive abilities with leading systems. Remarkably, they achieved this with\nsimple zero-shot learning, grounded with relevant snippets. Even without\nrelevant snippets, their performance was decent, though not on par with the\nbest systems. Interestingly, the older and cheaper GPT-3.5-Turbo system was\nable to compete with GPT-4 in the grounded Q&A setting on factoid and list\nanswers. In Task 11b Phase A, focusing on retrieval, query expansion through\nzero-shot learning improved performance, but the models fell short compared to\nother systems. The code needed to rerun these experiments is available through\nGitHub.\n","authors":["Samy Ateia","Udo Kruschwitz"],"pdf_url":"https://arxiv.org/pdf/2306.16108v2.pdf","comment":"Preprint accepted at the 11th BioASQ Workshop at the 14th Conference\n and Labs of the Evaluation Forum (CLEF) 2023; Changes: 1. Added related work\n and experimental setup sections. 2. Reworked discussion and future work\n section. 3. Fixed multiple typos and improved style. Changed license"},{"id":"http://arxiv.org/abs/2307.12573v1","updated":"2023-07-24T07:40:59Z","published":"2023-07-24T07:40:59Z","title":"Tachikuma: Understading Complex Interactions with Multi-Character and\n Novel Objects by Large Language Models","summary":" Recent advancements in natural language and Large Language Models (LLMs) have\nenabled AI agents to simulate human-like interactions within virtual worlds.\nHowever, these interactions still face limitations in complexity and\nflexibility, particularly in scenarios involving multiple characters and novel\nobjects. Pre-defining all interactable objects in the agent's world model\npresents challenges, and conveying implicit intentions to multiple characters\nthrough complex interactions remains difficult. To address these issues, we\npropose integrating virtual Game Masters (GMs) into the agent's world model,\ndrawing inspiration from Tabletop Role-Playing Games (TRPGs). GMs play a\ncrucial role in overseeing information, estimating players' intentions,\nproviding environment descriptions, and offering feedback, compensating for\ncurrent world model deficiencies. 
To facilitate future explorations for complex\ninteractions, we introduce a benchmark named Tachikuma, comprising a Multiple\ncharacter and novel Object based interaction Estimation (MOE) task and a\nsupporting dataset. MOE challenges models to understand characters' intentions\nand accurately determine their actions within intricate contexts involving\nmulti-character and novel object interactions. Besides, the dataset captures\nlog data from real-time communications during gameplay, providing diverse,\ngrounded, and complex interactions for further explorations. Finally, we\npresent a simple prompting baseline and evaluate its performance, demonstrating\nits effectiveness in enhancing interaction understanding. We hope that our\ndataset and task will inspire further research in complex interactions with\nnatural language, fostering the development of more advanced AI agents.\n","authors":["Yuanzhi Liang","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12573v1.pdf","comment":"Preliminary version of an ongoing work"},{"id":"http://arxiv.org/abs/2307.12564v1","updated":"2023-07-24T07:17:33Z","published":"2023-07-24T07:17:33Z","title":"Towards Generalising Neural Topical Representations","summary":" Topic models have evolved from conventional Bayesian probabilistic models to\nNeural Topic Models (NTMs) over the last two decays. Although NTMs have\nachieved promising performance when trained and tested on a specific corpus,\ntheir generalisation ability across corpora is rarely studied. In practice, we\noften expect that an NTM trained on a source corpus can still produce quality\ntopical representation for documents in a different target corpus without\nretraining. In this work, we aim to improve NTMs further so that their benefits\ngeneralise reliably across corpora and tasks. To do so, we propose to model\nsimilar documents by minimising their semantical distance when training NTMs.\nSpecifically, similar documents are created by data augmentation during\ntraining; The semantical distance between documents is measured by the\nHierarchical Topic Transport Distance (HOTT), which computes the Optimal\nTransport (OT) distance between the topical representations. Our framework can\nbe readily applied to most NTMs as a plug-and-play module. Extensive\nexperiments show that our framework significantly improves the generalisation\nability regarding neural topical representation across corpora.\n","authors":["Xiaohao Yang","He Zhao","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2307.12564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.11578v2","updated":"2023-07-24T06:53:10Z","published":"2021-03-22T04:44:43Z","title":"SparseGAN: Sparse Generative Adversarial Network for Text Generation","summary":" It is still a challenging task to learn a neural text generation model under\nthe framework of generative adversarial networks (GANs) since the entire\ntraining process is not differentiable. The existing training strategies either\nsuffer from unreliable gradient estimations or imprecise sentence\nrepresentations. Inspired by the principle of sparse coding, we propose a\nSparseGAN that generates semantic-interpretable, but sparse sentence\nrepresentations as inputs to the discriminator. The key idea is that we treat\nan embedding matrix as an over-complete dictionary, and use a linear\ncombination of very few selected word embeddings to approximate the output\nfeature representation of the generator at each time step. 
With such\nsemantic-rich representations, we not only reduce unnecessary noises for\nefficient adversarial training, but also make the entire training process fully\ndifferentiable. Experiments on multiple text generation datasets yield\nperformance improvements, especially in sequence-level metrics, such as BLEU.\n","authors":["Liping Yuan","Jiehang Zeng","Xiaoqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2103.11578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09710v3","updated":"2023-07-24T05:39:27Z","published":"2022-11-17T17:45:59Z","title":"Style Classification of Rabbinic Literature for Detection of Lost\n Midrash Tanhuma Material","summary":" Midrash collections are complex rabbinic works that consist of text in\nmultiple languages, which evolved through long processes of unstable oral and\nwritten transmission. Determining the origin of a given passage in such a\ncompilation is not always straightforward and is often a matter of dispute\namong scholars, yet it is essential for scholars' understanding of the passage\nand its relationship to other texts in the rabbinic corpus. To help solve this\nproblem, we propose a system for classification of rabbinic literature based on\nits style, leveraging recent advances in natural language processing for Hebrew\ntexts. Additionally, we demonstrate how this method can be applied to uncover\nlost material from a specific midrash genre, Tan\\d{h}uma-Yelammedenu, that has\nbeen preserved in later anthologies.\n","authors":["Shlomo Tannor","Nachum Dershowitz","Moshe Lavee"],"pdf_url":"https://arxiv.org/pdf/2211.09710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12520v1","updated":"2023-07-24T04:29:43Z","published":"2023-07-24T04:29:43Z","title":"Lost In Translation: Generating Adversarial Examples Robust to\n Round-Trip Translation","summary":" Language Models today provide a high accuracy across a large number of\ndownstream tasks. However, they remain susceptible to adversarial attacks,\nparticularly against those where the adversarial examples maintain considerable\nsimilarity to the original text. Given the multilingual nature of text, the\neffectiveness of adversarial examples across translations and how machine\ntranslations can improve the robustness of adversarial examples remain largely\nunexplored. In this paper, we present a comprehensive study on the robustness\nof current text adversarial attacks to round-trip translation. We demonstrate\nthat 6 state-of-the-art text-based adversarial attacks do not maintain their\nefficacy after round-trip translation. Furthermore, we introduce an\nintervention-based solution to this problem, by integrating Machine Translation\ninto the process of adversarial example generation and demonstrating increased\nrobustness to round-trip translation. 
Our results indicate that finding\nadversarial examples robust to translation can help identify the insufficiency\nof language models that is common across languages, and motivate further\nresearch into multilingual adversarial attacks.\n","authors":["Neel Bhandari","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12520v1.pdf","comment":"Published at International Conference on Acoustics, Speech, and\n Signal Processing (ICASSP) 2023"},{"id":"http://arxiv.org/abs/2009.04639v2","updated":"2023-07-24T03:56:31Z","published":"2020-09-10T02:22:21Z","title":"Improving Coreference Resolution by Leveraging Entity-Centric Features\n with Graph Neural Networks and Second-order Inference","summary":" One of the major challenges in coreference resolution is how to make use of\nentity-level features defined over clusters of mentions rather than mention\npairs. However, coreferent mentions usually spread far apart in an entire text,\nwhich makes it extremely difficult to incorporate entity-level features. We\npropose a graph neural network-based coreference resolution method that can\ncapture the entity-centric information by encouraging the sharing of features\nacross all mentions that probably refer to the same real-world entity. Mentions\nare linked to each other via the edges modeling how likely two linked mentions\npoint to the same entity. Modeling by such graphs, the features between\nmentions can be shared by message passing operations in an entity-centric\nmanner. A global inference algorithm up to second-order features is also\npresented to optimally cluster mentions into consistent groups. Experimental\nresults show our graph neural network-based method combing with the\nsecond-order decoding algorithm (named GNNCR) achieved close to\nstate-of-the-art performance on the English CoNLL-2012 Shared Task dataset.\n","authors":["Lu Liu","Zhenqiao Song","Xiaoqing Zheng","Jun He"],"pdf_url":"https://arxiv.org/pdf/2009.04639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12507v1","updated":"2023-07-24T03:44:17Z","published":"2023-07-24T03:44:17Z","title":"Investigating the Existence of \"Secret Language'' in Language Models","summary":" In this paper, we study the problem of secret language in NLP, where current\nlanguage models (LMs) seem to have a hidden vocabulary that allows them to\ninterpret absurd inputs as meaningful concepts. We investigate two research\nquestions: ``Does the secret language phenomenon exist in different language\nmodels?'' and ``Does secret language depend on specific context?'' To answer\nthese questions, we introduce a novel method named \\textit{SecretFinding}, a\ngradient-based approach that can automatically discover secret languages in\nLMs. We conduct experiments on five representative models (Electra, ALBERT,\nRoberta, DistillBERT, and CLIP) finetuned on four NLP benchmarks (SST-2, MRPC,\nSNLI, and SQuAD) and a language-grounding benchmark (MSCOCO). Our experimental\nresults show that even when we replace the most important words with others\nthat are semantically dissimilar to the original words in a sentence, LMs do\nnot consider the new sentence semantically dissimilar to the original, as the\noutput does not change with a high probability. This phenomenon holds true\nacross the five models and five tasks and gives a positive answer to the first\nresearch question. 
As for the second research question, we find that the secret\nlanguage discovered by \\textit{SecretFinding} is quite general and could even\nbe transferred to other models in the black-box settings, such as GPT-3 and\nChatGPT. Finally, we discuss the causes of secret language, how to eliminate\nit, the potential connection to memorization, and ethical implications.\nExamples of secret language found by SecretFinding are available on\nhttps://huggingface.co/spaces/anonymousauthors/ACL23_SecretLanguage.\n","authors":["Yimu Wang","Peng Shi","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13040v3","updated":"2023-07-24T03:31:42Z","published":"2023-05-22T13:47:51Z","title":"SpokenWOZ: A Large-Scale Speech-Text Benchmark for Spoken Task-Oriented\n Dialogue Agents","summary":" Task-oriented dialogue (TOD) models have made significant progress in recent\nyears. However, previous studies primarily focus on datasets written by\nannotators, which has resulted in a gap between academic research and\nreal-world spoken conversation scenarios. While several small-scale spoken TOD\ndatasets are proposed to address robustness issues such as ASR errors, they\nignore the unique challenges in spoken conversation. To tackle the limitations,\nwe introduce SpokenWOZ, a large-scale speech-text dataset for spoken TOD,\ncontaining 8 domains, 203k turns, 5.7k dialogues and 249 hours of audios from\nhuman-to-human spoken conversations. SpokenWOZ further incorporates common\nspoken characteristics such as word-by-word processing and reasoning in spoken\nlanguage. Based on these characteristics, we present cross-turn slot and\nreasoning slot detection as new challenges. We conduct experiments on various\nbaselines, including text-modal models, newly proposed dual-modal models, and\nLLMs, e.g., ChatGPT. The results show that the current models still have\nsubstantial room for improvement in spoken conversation, where the most\nadvanced dialogue state tracker only achieves 25.65% in joint goal accuracy and\nthe SOTA end-to-end model only correctly completes the user request in 52.1% of\ndialogues. The dataset, code, and leaderboard are available:\nhttps://spokenwoz.github.io/SpokenWOZ-github.io/.\n","authors":["Shuzheng Si","Wentao Ma","Haoyu Gao","Yuchuan Wu","Ting-En Lin","Yinpei Dai","Hangyu Li","Rui Yan","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2305.13040v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.07481v2","updated":"2023-07-24T03:26:17Z","published":"2020-09-16T05:58:00Z","title":"Unsupervised Summarization by Jointly Extracting Sentences and Keywords","summary":" We present RepRank, an unsupervised graph-based ranking model for extractive\nmulti-document summarization in which the similarity between words, sentences,\nand word-to-sentence can be estimated by the distances between their vector\nrepresentations in a unified vector space. In order to obtain desirable\nrepresentations, we propose a self-attention based learning method that\nrepresent a sentence by the weighted sum of its word embeddings, and the\nweights are concentrated to those words hopefully better reflecting the content\nof a document. We show that salient sentences and keywords can be extracted in\na joint and mutual reinforcement process using our learned representations, and\nprove that this process always converges to a unique solution leading to\nimprovement in performance. 
A variant of absorbing random walk and the\ncorresponding sampling-based algorithm are also described to avoid redundancy\nand increase diversity in the summaries. Experiment results with multiple\nbenchmark datasets show that RepRank achieved the best or comparable\nperformance in ROUGE.\n","authors":["Zongyi Li","Xiaoqing Zheng","Jun He"],"pdf_url":"https://arxiv.org/pdf/2009.07481v2.pdf","comment":"10 pages(includes 2 pages references), 1 figure"},{"id":"http://arxiv.org/abs/2307.12498v1","updated":"2023-07-24T03:07:40Z","published":"2023-07-24T03:07:40Z","title":"Robust Automatic Speech Recognition via WavAugment Guided Phoneme\n Adversarial Training","summary":" Developing a practically-robust automatic speech recognition (ASR) is\nchallenging since the model should not only maintain the original performance\non clean samples, but also achieve consistent efficacy under small volume\nperturbations and large domain shifts. To address this problem, we propose a\nnovel WavAugment Guided Phoneme Adversarial Training (wapat). wapat use\nadversarial examples in phoneme space as augmentation to make the model\ninvariant to minor fluctuations in phoneme representation and preserve the\nperformance on clean samples. In addition, wapat utilizes the phoneme\nrepresentation of augmented samples to guide the generation of adversaries,\nwhich helps to find more stable and diverse gradient-directions, resulting in\nimproved generalization. Extensive experiments demonstrate the effectiveness of\nwapat on End-to-end Speech Challenge Benchmark (ESB). Notably, SpeechLM-wapat\noutperforms the original model by 6.28% WER reduction on ESB, achieving the new\nstate-of-the-art.\n","authors":["Gege Qi","Yuefeng Chen","Xiaofeng Mao","Xiaojun Jia","Ranjie Duan","Rong Zhang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2307.12498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11610v2","updated":"2023-07-24T01:35:47Z","published":"2023-07-21T14:25:39Z","title":"CausE: Towards Causal Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) focuses on representing the entities and\nrelations of a knowledge graph (KG) into the continuous vector spaces, which\ncan be employed to predict the missing triples to achieve knowledge graph\ncompletion (KGC). However, KGE models often only briefly learn structural\ncorrelations of triple data and embeddings would be misled by the trivial\npatterns and noisy links in real-world KGs. To address this issue, we build the\nnew paradigm of KGE in the context of causality and embedding disentanglement.\nWe further propose a Causality-enhanced knowledge graph Embedding (CausE)\nframework. CausE employs causal intervention to estimate the causal effect of\nthe confounder embeddings and design new training objectives to make stable\npredictions. Experimental results demonstrate that CausE could outperform the\nbaseline models and achieve state-of-the-art KGC performance. 
We release our\ncode in https://github.com/zjukg/CausE.\n","authors":["Yichi Zhang","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11610v2.pdf","comment":"Accepted by CCKS 2023 as a research paper"},{"id":"http://arxiv.org/abs/2306.14096v4","updated":"2023-07-24T00:58:11Z","published":"2023-06-25T02:24:30Z","title":"Chinese Fine-Grained Financial Sentiment Analysis with Large Language\n Models","summary":" Entity-level fine-grained sentiment analysis in the financial domain is a\ncrucial subtask of sentiment analysis and currently faces numerous challenges.\nThe primary challenge stems from the lack of high-quality and large-scale\nannotated corpora specifically designed for financial text sentiment analysis,\nwhich in turn limits the availability of data necessary for developing\neffective text processing techniques. Recent advancements in large language\nmodels (LLMs) have yielded remarkable performance in natural language\nprocessing tasks, primarily centered around language pattern matching. In this\npaper, we propose a novel and extensive Chinese fine-grained financial\nsentiment analysis dataset, FinChina SA, for enterprise early warning. We\nthoroughly evaluate and experiment with well-known existing open-source LLMs\nusing our dataset. We firmly believe that our dataset will serve as a valuable\nresource to advance the exploration of real-world financial sentiment analysis\ntasks, which should be the focus of future research. The FinChina SA dataset is\npublicly available at https://github.com/YerayL/FinChina-SA\n","authors":["Yinyu Lan","Yanru Wu","Wang Xu","Weiqiang Feng","Youhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.14096v4.pdf","comment":"FinLLM Symposium at IJCAI 2023"},{"id":"http://arxiv.org/abs/2305.01788v3","updated":"2023-07-24T00:54:51Z","published":"2023-05-02T21:33:10Z","title":"Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation\n Incorporating Gloss Information","summary":" Visual Word Sense Disambiguation (VWSD) is a task to find the image that most\naccurately depicts the correct sense of the target word for the given context.\nPreviously, image-text matching models often suffered from recognizing\npolysemous words. This paper introduces an unsupervised VWSD approach that uses\ngloss information of an external lexical knowledge-base, especially the sense\ndefinitions. Specifically, we suggest employing Bayesian inference to\nincorporate the sense definitions when sense information of the answer is not\nprovided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we\npropose a context-aware definition generation with GPT-3. Experimental results\nshow that the VWSD performance significantly increased with our Bayesian\ninference-based approach. In addition, our context-aware definition generation\nachieved prominent performance improvement in OOD examples exhibiting better\nperformance than the existing definition generation method.\n","authors":["Sunjae Kwon","Rishabh Garodia","Minhwa Lee","Zhichao Yang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2305.01788v3.pdf","comment":"ACL 2023, https://aclanthology.org/2023.acl-long.88"},{"id":"http://arxiv.org/abs/2307.02591v2","updated":"2023-07-24T00:47:23Z","published":"2023-07-05T18:41:29Z","title":"ODD: A Benchmark Dataset for the NLP-based Opioid Related Aberrant\n Behavior Detection","summary":" Opioid related aberrant behaviors (ORAB) present novel risk factors for\nopioid overdose. 
Previously, ORAB have been mainly assessed by survey results\nand by monitoring drug administrations. Such methods, however, cannot scale up\nand do not cover the entire spectrum of aberrant behaviors. On the other hand,\nORAB are widely documented in electronic health record notes. This paper\nintroduces a novel biomedical natural language processing benchmark dataset\nnamed ODD, for ORAB Detection Dataset. ODD is an expert-annotated dataset\ncomprising more than 750 publicly available EHR notes. ODD has been designed\nto identify ORAB from patients' EHR notes and classify them into nine\ncategories: 1) Confirmed Aberrant Behavior, 2) Suggested Aberrant Behavior, 3)\nOpioids, 4) Indication, 5) Diagnosed opioid dependency, 6) Benzodiazepines, 7)\nMedication Changes, 8) Central Nervous System-related, and 9) Social\nDeterminants of Health. We explored two state-of-the-art natural language\nprocessing (NLP) models (finetuning pretrained language models and\nprompt-tuning approaches) to identify ORAB. Experimental results show that the\nprompt-tuning models outperformed the finetuning models in most categories and\nthe gains were especially higher among uncommon categories (Suggested aberrant\nbehavior, Diagnosed opioid dependency and Medication change). Although the best\nmodel achieved the highest area under the precision-recall curve of 83.92%,\nuncommon classes (Suggested Aberrant Behavior, Diagnosed Opioid Dependence, and\nMedication Change) still have large room for performance improvement.\n","authors":["Sunjae Kwon","Xun Wang","Weisong Liu","Emily Druhl","Minhee L. Sung","Joel I. Reisman","Wenjun Li","Robert D. Kerns","William Becker","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2307.02591v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.13176v1","updated":"2023-07-24T23:53:13Z","published":"2023-07-24T23:53:13Z","title":"Schema-Driven Actionable Insight Generation and Smart Recommendation","summary":" In natural language generation (NLG), insight mining is seen as a\ndata-to-text task, where data is mined for interesting patterns and verbalised\ninto 'insight' statements. An 'over-generate and rank' paradigm is intuitively\nused to generate such insights. The multidimensionality and subjectivity of\nthis process make it challenging. This paper introduces a schema-driven method\nto generate actionable insights from data to drive growth and change. It also\nintroduces a technique to rank the insights to align with user interests based\non their feedback. We show preliminary qualitative results of the insights\ngenerated using our technique and demonstrate its ability to adapt to feedback.\n","authors":["Allmin Susaiyah","Aki Härmä","Milan Petković"],"pdf_url":"https://arxiv.org/pdf/2307.13176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13173v1","updated":"2023-07-24T23:42:32Z","published":"2023-07-24T23:42:32Z","title":"Opinion Mining Using Population-tuned Generative Language Models","summary":" We present a novel method for mining opinions from text collections using\ngenerative language models trained on data collected from different\npopulations. We describe the basic definitions, methodology and a generic\nalgorithm for opinion insight mining. We demonstrate the performance of our\nmethod in an experiment where a pre-trained generative model is fine-tuned\nusing specifically tailored content with unnatural and fully annotated\nopinions. 
We show that our approach can learn and transfer the opinions to the\nsemantic classes while maintaining the proportion of polarisation. Finally, we\ndemonstrate the usage of an insight mining system to scale up the discovery of\nopinion insights from a real text corpus.\n","authors":["Allmin Susaiyah","Abhinay Pandya","Aki Härmä"],"pdf_url":"https://arxiv.org/pdf/2307.13173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13128v1","updated":"2023-07-24T21:05:47Z","published":"2023-07-24T21:05:47Z","title":"Explaining Math Word Problem Solvers","summary":" Automated math word problem solvers based on neural networks have\nsuccessfully managed to obtain 70-80\\% accuracy in solving arithmetic word\nproblems. However, it has been shown that these solvers may rely on superficial\npatterns to obtain their equations. In order to determine what information math\nword problem solvers use to generate solutions, we remove parts of the input\nand measure the model's performance on the perturbed dataset. Our results show\nthat the model is not sensitive to the removal of many words from the input and\ncan still manage to find a correct answer when given a nonsense question. This\nindicates that automatic solvers do not follow the semantic logic of math word\nproblems, and may be overfitting to the presence of specific words.\n","authors":["Abby Newcomb","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2307.13128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.15498v2","updated":"2023-07-24T20:08:20Z","published":"2021-06-29T15:25:33Z","title":"Classification of Consumer Belief Statements From Social Media","summary":" Social media offer plenty of information to perform market research in order\nto meet the requirements of customers. One way how this research is conducted\nis that a domain expert gathers and categorizes user-generated content into a\ncomplex and fine-grained class structure. In many of such cases, little data\nmeets complex annotations. It is not yet fully understood how this can be\nleveraged successfully for classification. We examine the classification\naccuracy of expert labels when used with a) many fine-grained classes and b)\nfew abstract classes. For scenario b) we compare abstract class labels given by\nthe domain expert as baseline and by automatic hierarchical clustering. We\ncompare this to another baseline where the entire class structure is given by a\ncompletely unsupervised clustering approach. By doing so, this work can serve\nas an example of how complex expert annotations are potentially beneficial and\ncan be utilized in the most optimal way for opinion mining in highly specific\ndomains. By exploring across a range of techniques and experiments, we find\nthat automated class abstraction approaches in particular the unsupervised\napproach performs remarkably well against domain expert baseline on text\nclassification tasks. 
This has the potential to inspire opinion mining\napplications in order to support market researchers in practice and to inspire\nfine-grained automated content analysis on a large scale.\n","authors":["Gerhard Johann Hagerer","Wenbin Le","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2106.15498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.10575v2","updated":"2023-07-24T20:07:07Z","published":"2021-10-20T14:04:13Z","title":"SocialVisTUM: An Interactive Visualization Toolkit for Correlated Neural\n Topic Models on Social Media Opinion Mining","summary":" Recent research in opinion mining proposed word embedding-based topic\nmodeling methods that provide superior coherence compared to traditional topic\nmodeling. In this paper, we demonstrate how these methods can be used to\ndisplay correlated topic models on social media texts using SocialVisTUM, our\nproposed interactive visualization toolkit. It displays a graph with topics as\nnodes and their correlations as edges. Further details are displayed\ninteractively to support the exploration of large text collections, e.g.,\nrepresentative words and sentences of topics, topic and sentiment\ndistributions, hierarchical topic clustering, and customizable, predefined\ntopic labels. The toolkit optimizes automatically on custom data for optimal\ncoherence. We show a working instance of the toolkit on data crawled from\nEnglish social media discussions about organic food consumption. The\nvisualization confirms findings of a qualitative consumer research study.\nSocialVisTUM and its training procedures are accessible online.\n","authors":["Gerhard Johann Hagerer","Martin Kirchhoff","Hannah Danner","Robert Pesch","Mainak Ghosh","Archishman Roy","Jiaxi Zhao","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2110.10575v2.pdf","comment":"Demo paper accepted for publication on RANLP 2021; 8 pages, 5\n figures, 1 table"},{"id":"http://arxiv.org/abs/2110.15134v2","updated":"2023-07-24T20:05:38Z","published":"2021-10-28T14:09:44Z","title":"An Analysis of Programming Course Evaluations Before and After the\n Introduction of an Autograder","summary":" Commonly, introductory programming courses in higher education institutions\nhave hundreds of participating students eager to learn to program. The manual\neffort for reviewing the submitted source code and for providing feedback can\nno longer be managed. Manually reviewing the submitted homework can be\nsubjective and unfair, particularly if many tutors are responsible for grading.\nDifferent autograders can help in this situation; however, there is a lack of\nknowledge about how autograders can impact students' overall perception of\nprogramming classes and teaching. This is relevant for course organizers and\ninstitutions to keep their programming courses attractive while coping with\nincreasing students.\n This paper studies the answers to the standardized university evaluation\nquestionnaires of multiple large-scale foundational computer science courses\nwhich recently introduced autograding. The differences before and after this\nintervention are analyzed. By incorporating additional observations, we\nhypothesize how the autograder might have contributed to the significant\nchanges in the data, such as, improved interactions between tutors and\nstudents, improved overall course quality, improved learning success, increased\ntime spent, and reduced difficulty. 
This qualitative study aims to provide\nhypotheses for future research to define and conduct quantitative surveys and\ndata analysis. The autograder technology can be validated as a teaching method\nto improve student satisfaction with programming courses.\n","authors":["Gerhard Johann Hagerer","Laura Lahesoo","Miriam Anschütz","Stephan Krusche","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2110.15134v2.pdf","comment":"Accepted full paper article on IEEE ITHET 2021"},{"id":"http://arxiv.org/abs/2111.02259v3","updated":"2023-07-24T20:03:14Z","published":"2021-11-03T14:49:50Z","title":"A Case Study and Qualitative Analysis of Simple Cross-Lingual Opinion\n Mining","summary":" User-generated content from social media is produced in many languages,\nmaking it technically challenging to compare the discussed themes from one\ndomain across different cultures and regions. It is relevant for domains in a\nglobalized world, such as market research, where people from two nations and\nmarkets might have different requirements for a product. We propose a simple,\nmodern, and effective method for building a single topic model with sentiment\nanalysis capable of covering multiple languages simultaneously, based on a\npre-trained state-of-the-art deep neural network for natural language\nunderstanding. To demonstrate its feasibility, we apply the model to newspaper\narticles and user comments of a specific domain, i.e., organic food products\nand related consumption behavior. The themes match across languages.\nAdditionally, we obtain a high proportion of stable and domain-relevant\ntopics, a meaningful relation between topics and their respective textual\ncontents, and an interpretable representation for social media documents.\nMarketing can potentially benefit from our method, since it provides an\neasy-to-use means of addressing specific customer interests from different\nmarket regions around the globe. For reproducibility, we provide the code,\ndata, and results of our study.\n","authors":["Gerhard Johann Hagerer","Wing Sheung Leung","Qiaoxi Liu","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2111.02259v3.pdf","comment":"10 pages, 2 tables, 5 figures, full paper, peer-reviewed, published\n at KDIR/IC3k 2021 conference"},{"id":"http://arxiv.org/abs/2307.13106v1","updated":"2023-07-24T19:54:15Z","published":"2023-07-24T19:54:15Z","title":"How to use LLMs for Text Analysis","summary":" This guide introduces Large Language Models (LLM) as a highly versatile text\nanalysis method within the social sciences. As LLMs are easy-to-use, cheap,\nfast, and applicable on a broad range of text analysis tasks, ranging from text\nannotation and classification to sentiment analysis and critical discourse\nanalysis, many scholars believe that LLMs will transform how we do text\nanalysis. This how-to guide is aimed at students and researchers with limited\nprogramming experience, and offers a simple introduction to how LLMs can be\nused for text analysis in your own research project, as well as advice on best\npractices. We will go through each of the steps of analyzing textual data with\nLLMs using Python: installing the software, setting up the API, loading the\ndata, developing an analysis prompt, analyzing the text, and validating the\nresults. 
As an illustrative example, we will use the challenging task of\nidentifying populism in political texts, and show how LLMs move beyond the\nexisting state-of-the-art.\n","authors":["Petter Törnberg"],"pdf_url":"https://arxiv.org/pdf/2307.13106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.02326v2","updated":"2023-07-24T19:44:53Z","published":"2021-11-03T16:20:16Z","title":"End-to-End Annotator Bias Approximation on Crowdsourced Single-Label\n Sentiment Analysis","summary":" Sentiment analysis is often a crowdsourcing task prone to subjective labels\ngiven by many annotators. It is not yet fully understood how the annotation\nbias of each annotator can be modeled correctly with state-of-the-art methods.\nHowever, resolving annotator bias precisely and reliably is the key to\nunderstand annotators' labeling behavior and to successfully resolve\ncorresponding individual misconceptions and wrongdoings regarding the\nannotation task. Our contribution is an explanation and improvement for precise\nneural end-to-end bias modeling and ground truth estimation, which reduces an\nundesired mismatch in that regard of the existing state-of-the-art.\nClassification experiments show that it has potential to improve accuracy in\ncases where each sample is annotated only by one single annotator. We provide\nthe whole source code publicly and release an own domain-specific sentiment\ndataset containing 10,000 sentences discussing organic food products. These are\ncrawled from social media and are singly labeled by 10 non-expert annotators.\n","authors":["Gerhard Johann Hagerer","David Szabo","Andreas Koch","Maria Luisa Ripoll Dominguez","Christian Widmer","Maximilian Wich","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2111.02326v2.pdf","comment":"10 pages, 2 figures, 2 tables, full conference paper, peer-reviewed"},{"id":"http://arxiv.org/abs/2305.17008v2","updated":"2023-07-24T19:18:25Z","published":"2023-05-26T15:09:11Z","title":"NormBank: A Knowledge Bank of Situational Social Norms","summary":" We present NormBank, a knowledge bank of 155k situational norms. This\nresource is designed to ground flexible normative reasoning for interactive,\nassistive, and collaborative AI systems. Unlike prior commonsense resources,\nNormBank grounds each inference within a multivalent sociocultural frame, which\nincludes the setting (e.g., restaurant), the agents' contingent roles (waiter,\ncustomer), their attributes (age, gender), and other physical, social, and\ncultural constraints (e.g., the temperature or the country of operation). In\ntotal, NormBank contains 63k unique constraints from a taxonomy that we\nintroduce and iteratively refine here. Constraints then apply in different\ncombinations to frame social norms. Under these manipulations, norms are\nnon-monotonic - one can cancel an inference by updating its frame even\nslightly. Still, we find evidence that neural models can help reliably extend\nthe scope and coverage of NormBank. 
We further demonstrate the utility of this\nresource with a series of transfer experiments.\n","authors":["Caleb Ziems","Jane Dwivedi-Yu","Yi-Chia Wang","Alon Halevy","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.17008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13085v1","updated":"2023-07-24T19:14:38Z","published":"2023-07-24T19:14:38Z","title":"Making Metadata More FAIR Using Large Language Models","summary":" With the global increase in experimental data artifacts, harnessing them in a\nunified fashion leads to a major stumbling block - bad metadata. To bridge this\ngap, this work presents a Natural Language Processing (NLP) informed\napplication, called FAIRMetaText, that compares metadata. Specifically,\nFAIRMetaText analyzes the natural language descriptions of metadata and\nprovides a mathematical similarity measure between two terms. This measure can\nthen be utilized for analyzing varied metadata, by suggesting terms for\ncompliance or grouping similar terms for identification of replaceable terms.\nThe efficacy of the algorithm is presented qualitatively and quantitatively on\npublicly available research artifacts and demonstrates large gains across\nmetadata related tasks through an in-depth study of a wide variety of Large\nLanguage Models (LLMs). This software can drastically reduce the human effort\nin sifting through various natural language metadata while employing several\nexperimental datasets on the same topic.\n","authors":["Sowmya S. Sundaram","Mark A. Musen"],"pdf_url":"https://arxiv.org/pdf/2307.13085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00017v2","updated":"2023-07-24T18:46:22Z","published":"2023-05-30T15:15:40Z","title":"Towards Explainable and Language-Agnostic LLMs: Symbolic Reverse\n Engineering of Language at Scale","summary":" Large language models (LLMs) have achieved a milestone that undeniably\nchanged many held beliefs in artificial intelligence (AI). However, there\nremain many limitations of these LLMs when it comes to true language\nunderstanding, limitations that are a byproduct of the underlying architecture\nof deep neural networks. Moreover, and due to their subsymbolic nature,\nwhatever knowledge these models acquire about how language works will always be\nburied in billions of microfeatures (weights), none of which is meaningful on\nits own, making such models hopelessly unexplainable. To address these\nlimitations, we suggest combining the strength of symbolic representations\nwith what we believe to be the key to the success of LLMs, namely a successful\nbottom-up reverse engineering of language at scale. As such we argue for a\nbottom-up reverse engineering of language in a symbolic setting. Hints on what\nthis project amounts to have been suggested by several authors, and we discuss\nin some detail here how this project could be accomplished.\n","authors":["Walid S. Saba"],"pdf_url":"https://arxiv.org/pdf/2306.00017v2.pdf","comment":"Draft, preprint"},{"id":"http://arxiv.org/abs/2307.13018v1","updated":"2023-07-24T17:17:13Z","published":"2023-07-24T17:17:13Z","title":"The potential of LLMs for coding with low-resource and domain-specific\n programming languages","summary":" This paper presents a study on the feasibility of using large language models\n(LLM) for coding with low-resource and domain-specific programming languages\nthat typically lack the amount of data required for effective LLM processing\ntechniques. 
This study focuses on the econometric scripting language named\nhansl of the open-source software gretl and employs a proprietary LLM based on\nGPT-3.5. Our findings suggest that LLMs can be a useful tool for writing,\nunderstanding, improving, and documenting gretl code, which includes generating\ndescriptive docstrings for functions and providing precise explanations for\nabstract and poorly documented econometric code. While the LLM showcased\npromising docstring-to-code translation capability, we also identify some\nlimitations, such as its inability to improve certain sections of code and to\nwrite accurate unit tests. This study is a step towards leveraging the power of\nLLMs to facilitate software development in low-resource programming languages\nand ultimately to lower barriers to entry for their adoption.\n","authors":["Artur Tarassow"],"pdf_url":"https://arxiv.org/pdf/2307.13018v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi-view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2209.05407v3","updated":"2023-07-24T17:58:31Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. 
However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen unknown categories into instances, without any prior knowledge about\nthem, while performing panoptic segmentation of known classes. We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12980v1","updated":"2023-07-24T17:58:06Z","published":"2023-07-24T17:58:06Z","title":"A Systematic Survey of Prompt Engineering on Vision-Language Foundation\n Models","summary":" Prompt engineering is a technique that involves augmenting a large\npre-trained model with task-specific hints, known as prompts, to adapt the\nmodel to new tasks. Prompts can be created manually as natural language\ninstructions or generated automatically as either natural language instructions\nor vector representations. Prompt engineering enables the ability to perform\npredictions based solely on prompts without updating model parameters, and the\neasier application of large pre-trained models in real-world tasks. In past\nyears, Prompt engineering has been well-studied in natural language processing.\nRecently, it has also been intensively studied in vision-language modeling.\nHowever, there is currently a lack of a systematic overview of prompt\nengineering on pre-trained vision-language models. This paper aims to provide a\ncomprehensive survey of cutting-edge research in prompt engineering on three\ntypes of vision-language models: multimodal-to-text generation models (e.g.\nFlamingo), image-text matching models (e.g. CLIP), and text-to-image generation\nmodels (e.g. Stable Diffusion). For each type of model, a brief model summary,\nprompting methods, prompting-based applications, and the corresponding\nresponsibility and integrity issues are summarized and discussed. Furthermore,\nthe commonalities and differences between prompting on vision-language models,\nlanguage models, and vision models are also discussed. 
The challenges, future\ndirections, and research opportunities are summarized to foster future research\non this topic.\n","authors":["Jindong Gu","Zhen Han","Shuo Chen","Ahmad Beirami","Bailan He","Gengyuan Zhang","Ruotong Liao","Yao Qin","Volker Tresp","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2307.12980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12972v1","updated":"2023-07-24T17:49:11Z","published":"2023-07-24T17:49:11Z","title":"DFA3D: 3D Deformable Attention For 2D-to-3D Feature Lifting","summary":" In this paper, we propose a new operator, called 3D DeFormable Attention\n(DFA3D), for 2D-to-3D feature lifting, which transforms multi-view 2D image\nfeatures into a unified 3D space for 3D object detection. Existing feature\nlifting approaches, such as Lift-Splat-based and 2D attention-based, either use\nestimated depth to get pseudo LiDAR features and then splat them to a 3D space,\nwhich is a one-pass operation without feature refinement, or ignore depth and\nlift features by 2D attention mechanisms, which achieve finer semantics while\nsuffering from a depth ambiguity problem. In contrast, our DFA3D-based method\nfirst leverages the estimated depth to expand each view's 2D feature map to 3D\nand then utilizes DFA3D to aggregate features from the expanded 3D feature\nmaps. With the help of DFA3D, the depth ambiguity problem can be effectively\nalleviated from the root, and the lifted features can be progressively refined\nlayer by layer, thanks to the Transformer-like architecture. In addition, we\npropose a mathematically equivalent implementation of DFA3D which can\nsignificantly improve its memory efficiency and computational speed. We\nintegrate DFA3D into several methods that use 2D attention-based feature\nlifting with only a few modifications in code and evaluate on the nuScenes\ndataset. The experiment results show a consistent improvement of +1.41\\% mAP on\naverage, and up to +15.1\\% mAP improvement when high-quality depth information\nis available, demonstrating the superiority, applicability, and huge potential\nof DFA3D. The code is available at\nhttps://github.com/IDEA-Research/3D-deformable-attention.git.\n","authors":["Hongyang Li","Hao Zhang","Zhaoyang Zeng","Shilong Liu","Feng Li","Tianhe Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12970v1","updated":"2023-07-24T17:49:04Z","published":"2023-07-24T17:49:04Z","title":"Volcanic ash delimitation using Artificial Intelligence based on Pix2Pix","summary":" Volcanic eruptions emit ash that can be harmful to human health and cause\ndamage to infrastructure, economic activities and the environment. The\ndelimitation of ash clouds allows to know their behavior and dispersion, which\nhelps in the prevention and mitigation of this phenomenon. Traditional methods\ntake advantage of specialized software programs to process the bands or\nchannels that compose the satellite images. However, their use is limited to\nexperts and demands a lot of time and significant computational resources. In\nrecent years, Artificial Intelligence has been a milestone in the computational\ntreatment of complex problems in different areas. In particular, Deep Learning\ntechniques allow automatic, fast and accurate processing of digital images. The\npresent work proposes the use of the Pix2Pix model, a type of generative\nadversarial network that, once trained, learns the mapping of input images to\noutput images. 
The architecture of such a network consisting of a generator and\na discriminator provides the versatility needed to produce black and white ash\ncloud images from multispectral satellite images. The evaluation of the model,\nbased on loss and accuracy plots, a confusion matrix, and visual inspection,\nindicates a satisfactory solution for accurate ash cloud delineation,\napplicable in any area of the world and becomes a useful tool in risk\nmanagement.\n","authors":["Christian Carrillo","Gissela Torres","Christian Mejia-Escobar"],"pdf_url":"https://arxiv.org/pdf/2307.12970v1.pdf","comment":"18 pages, in Spanish language, 15 figures"},{"id":"http://arxiv.org/abs/2307.12967v1","updated":"2023-07-24T17:45:40Z","published":"2023-07-24T17:45:40Z","title":"Learning Dense Correspondences between Photos and Sketches","summary":" Humans effortlessly grasp the connection between sketches and real-world\nobjects, even when these sketches are far from realistic. Moreover, human\nsketch understanding goes beyond categorization -- critically, it also entails\nunderstanding how individual elements within a sketch correspond to parts of\nthe physical world it represents. What are the computational ingredients needed\nto support this ability? Towards answering this question, we make two\ncontributions: first, we introduce a new sketch-photo correspondence benchmark,\n$\\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across\n125 object categories, augmenting the existing Sketchy dataset with\nfine-grained correspondence metadata. Second, we propose a self-supervised\nmethod for learning dense correspondences between sketch-photo pairs, building\nupon recent advances in correspondence learning for pairs of photos. Our model\nuses a spatial transformer network to estimate the warp flow between latent\nrepresentations of a sketch and photo extracted by a contrastive learning-based\nConvNet backbone. We found that this approach outperformed several strong\nbaselines and produced predictions that were quantitatively consistent with\nother warp-based methods. However, our benchmark also revealed systematic\ndifferences between predictions of the suite of models we tested and those of\nhumans. Taken together, our work suggests a promising path towards developing\nartificial systems that achieve more human-like understanding of visual images\nat different levels of abstraction. Project page:\nhttps://photo-sketch-correspondence.github.io\n","authors":["Xuanchen Lu","Xiaolong Wang","Judith E Fan"],"pdf_url":"https://arxiv.org/pdf/2307.12967v1.pdf","comment":"Accepted to ICML 2023. Project page:\n https://photo-sketch-correspondence.github.io"},{"id":"http://arxiv.org/abs/2307.12964v1","updated":"2023-07-24T17:43:13Z","published":"2023-07-24T17:43:13Z","title":"Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature\n Alignment","summary":" Text-to-video retrieval systems have recently made significant progress by\nutilizing pre-trained models trained on large-scale image-text pairs. However,\nmost of the latest methods primarily focus on the video modality while\ndisregarding the audio signal for this task. Nevertheless, a recent advancement\nby ECLIPSE has improved long-range text-to-video retrieval by developing an\naudiovisual video representation. Nonetheless, the objective of the\ntext-to-video retrieval task is to capture the complementary audio and video\ninformation that is pertinent to the text query rather than simply achieving\nbetter audio and video alignment. 
To address this issue, we introduce TEFAL, a\nTExt-conditioned Feature ALignment method that produces both audio and video\nrepresentations conditioned on the text query. Instead of using only an\naudiovisual attention block, which could suppress the audio information\nrelevant to the text query, our approach employs two independent cross-modal\nattention blocks that enable the text to attend to the audio and video\nrepresentations separately. Our proposed method's efficacy is demonstrated on\nfour benchmark datasets that include audio: MSR-VTT, LSMDC, VATEX, and\nCharades, and achieves better than state-of-the-art performance consistently\nacross the four datasets. This is attributed to the additional\ntext-query-conditioned audio representation and the complementary information\nit adds to the text-query-conditioned video representation.\n","authors":["Sarah Ibrahimi","Xiaohang Sun","Pichao Wang","Amanmeet Garg","Ashutosh Sanan","Mohamed Omar"],"pdf_url":"https://arxiv.org/pdf/2307.12964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12941v1","updated":"2023-07-24T17:11:39Z","published":"2023-07-24T17:11:39Z","title":"On Privileged and Convergent Bases in Neural Network Representations","summary":" In this study, we investigate whether the representations learned by neural\nnetworks possess a privileged and convergent basis. Specifically, we examine\nthe significance of feature directions represented by individual neurons.\nFirst, we establish that arbitrary rotations of neural representations cannot\nbe inverted (unlike linear networks), indicating that they do not exhibit\ncomplete rotational invariance. Subsequently, we explore the possibility of\nmultiple bases achieving identical performance. To do this, we compare the\nbases of networks trained with the same parameters but with varying random\ninitializations. Our study reveals two findings: (1) Even in wide networks such\nas WideResNets, neural networks do not converge to a unique basis; (2) Basis\ncorrelation increases significantly when a few early layers of the network are\nfrozen identically.\n Furthermore, we analyze Linear Mode Connectivity, which has been studied as a\nmeasure of basis correlation. Our findings give evidence that while Linear Mode\nConnectivity improves with increased network width, this improvement is not due\nto an increase in basis correlation.\n","authors":["Davis Brown","Nikhil Vyas","Yamini Bansal"],"pdf_url":"https://arxiv.org/pdf/2307.12941v1.pdf","comment":"In the Workshop on High-dimensional Learning Dynamics at ICML 2023"},{"id":"http://arxiv.org/abs/2307.12917v1","updated":"2023-07-24T16:18:22Z","published":"2023-07-24T16:18:22Z","title":"Hierarchical Skeleton Meta-Prototype Contrastive Learning with Hard\n Skeleton Mining for Unsupervised Person Re-Identification","summary":" With rapid advancements in depth sensors and deep learning, skeleton-based\nperson re-identification (re-ID) models have recently achieved remarkable\nprogress with many advantages. Most existing solutions learn single-level\nskeleton features from body joints with the assumption of equal skeleton\nimportance, while they typically lack the ability to exploit more informative\nskeleton features from various levels such as limb level with more global body\npatterns. The label dependency of these methods also limits their flexibility\nin learning more general skeleton representations. 
This paper proposes a\ngeneric unsupervised Hierarchical skeleton Meta-Prototype Contrastive learning\n(Hi-MPC) approach with Hard Skeleton Mining (HSM) for person re-ID with\nunlabeled 3D skeletons. Firstly, we construct hierarchical representations of\nskeletons to model coarse-to-fine body and motion features from the levels of\nbody joints, components, and limbs. Then a hierarchical meta-prototype\ncontrastive learning model is proposed to cluster and contrast the most typical\nskeleton features (\"prototypes\") from different-level skeletons. By converting\noriginal prototypes into meta-prototypes with multiple homogeneous\ntransformations, we induce the model to learn the inherent consistency of\nprototypes to capture more effective skeleton features for person re-ID.\nFurthermore, we devise a hard skeleton mining mechanism to adaptively infer the\ninformative importance of each skeleton, so as to focus on harder skeletons to\nlearn more discriminative skeleton representations. Extensive evaluations on\nfive datasets demonstrate that our approach outperforms a wide variety of\nstate-of-the-art skeleton-based methods. We further show the general\napplicability of our method to cross-view person re-ID and RGB-based scenarios\nwith estimated skeletons.\n","authors":["Haocong Rao","Cyril Leung","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2307.12917v1.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV). Codes\n are available at https://github.com/Kali-Hac/Hi-MPC. Supplemental materials\n will be included in the published version"},{"id":"http://arxiv.org/abs/2307.12914v1","updated":"2023-07-24T16:13:43Z","published":"2023-07-24T16:13:43Z","title":"Towards a Visual-Language Foundation Model for Computational Pathology","summary":" The accelerated adoption of digital pathology and advances in deep learning\nhave enabled the development of powerful models for various pathology tasks\nacross a diverse array of diseases and patient cohorts. However, model training\nis often difficult due to label scarcity in the medical domain and the model's\nusage is limited by the specific task and disease for which it is trained.\nAdditionally, most models in histopathology leverage only image data, a stark\ncontrast to how humans teach each other and reason about histopathologic\nentities. We introduce CONtrastive learning from Captions for Histopathology\n(CONCH), a visual-language foundation model developed using diverse sources of\nhistopathology images, biomedical text, and notably over 1.17 million\nimage-caption pairs via task-agnostic pretraining. Evaluated on a suite of 13\ndiverse benchmarks, CONCH can be transferred to a wide range of downstream\ntasks involving either or both histopathology images and text, achieving\nstate-of-the-art performance on histology image classification, segmentation,\ncaptioning, text-to-image and image-to-text retrieval. CONCH represents a\nsubstantial leap over concurrent visual-language pretrained systems for\nhistopathology, with the potential to directly facilitate a wide array of\nmachine learning-based workflows requiring minimal or no further supervised\nfine-tuning.\n","authors":["Ming Y. Lu","Bowen Chen","Drew F. K. Williamson","Richard J. 
Chen","Ivy Liang","Tong Ding","Guillaume Jaume","Igor Odintsov","Andrew Zhang","Long Phi Le","Georg Gerber","Anil V Parwani","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2307.12914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12909v1","updated":"2023-07-24T16:08:32Z","published":"2023-07-24T16:08:32Z","title":"Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields","summary":" Recently, the editing of neural radiance fields (NeRFs) has gained\nconsiderable attention, but most prior works focus on static scenes while\nresearch on the appearance editing of dynamic scenes is relatively lacking. In\nthis paper, we propose a novel framework to edit the local appearance of\ndynamic NeRFs by manipulating pixels in a single frame of training video.\nSpecifically, to locally edit the appearance of dynamic NeRFs while preserving\nunedited regions, we introduce a local surface representation of the edited\nregion, which can be inserted into and rendered along with the original NeRF\nand warped to arbitrary other frames through a learned invertible motion\nrepresentation network. By employing our method, users without professional\nexpertise can easily add desired content to the appearance of a dynamic scene.\nWe extensively evaluate our approach on various scenes and show that our\napproach achieves spatially and temporally consistent editing results. Notably,\nour approach is versatile and applicable to different variants of dynamic NeRF\nrepresentations.\n","authors":["Shangzhan Zhang","Sida Peng","Yinji ShenTu","Qing Shuai","Tianrun Chen","Kaicheng Yu","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12909v1.pdf","comment":"project page: https://dyn-e.github.io/"},{"id":"http://arxiv.org/abs/2307.12907v1","updated":"2023-07-24T16:02:42Z","published":"2023-07-24T16:02:42Z","title":"GridMM: Grid Memory Map for Vision-and-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nTo represent the previously visited environment, most approaches for VLN\nimplement memory using recurrent states, topological maps, or top-down semantic\nmaps. In contrast to these approaches, we build the top-down egocentric and\ndynamically growing Grid Memory Map (i.e., GridMM) to structure the visited\nenvironment. From a global perspective, historical observations are projected\ninto a unified grid map in a top-down view, which can better represent the\nspatial relations of the environment. From a local perspective, we further\npropose an instruction relevance aggregation method to capture fine-grained\nvisual clues in each grid region. Extensive experiments are conducted on both\nthe REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE\ndataset in the continuous environments, showing the superiority of our proposed\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.12907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12900v1","updated":"2023-07-24T15:47:21Z","published":"2023-07-24T15:47:21Z","title":"Automotive Object Detection via Learning Sparse Events by Temporal\n Dynamics of Spiking Neurons","summary":" Event-based sensors, with their high temporal resolution (1us) and dynamical\nrange (120dB), have the potential to be deployed in high-speed platforms such\nas vehicles and drones. 
However, the highly sparse and fluctuating nature of\nevents poses challenges for conventional object detection techniques based on\nArtificial Neural Networks (ANNs). In contrast, Spiking Neural Networks (SNNs)\nare well-suited for representing event-based data due to their inherent\ntemporal dynamics. In particular, we demonstrate that the membrane potential\ndynamics can modulate network activity upon fluctuating events and strengthen\nfeatures of sparse input. In addition, the spike-triggered adaptive threshold\ncan stabilize training which further improves network performance. Based on\nthis, we develop an efficient spiking feature pyramid network for event-based\nobject detection. Our proposed SNN outperforms previous SNNs and sophisticated\nANNs with attention mechanisms, achieving a mean average precision (map50) of\n47.7% on the Gen1 benchmark dataset. This result significantly surpasses the\nprevious best SNN by 9.7% and demonstrates the potential of SNNs for\nevent-based vision. Our model has a concise architecture while maintaining high\naccuracy and much lower computation cost as a result of sparse computation. Our\ncode will be publicly available.\n","authors":["Hu Zhang","Luziwei Leng","Kaiwei Che","Qian Liu","Jie Cheng","Qinghai Guo","Jiangxing Liao","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.12900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12803v3","updated":"2023-07-24T15:27:16Z","published":"2022-01-30T12:53:51Z","title":"Generalizing similarity in noisy setups: the DIBS phenomenon","summary":" This work uncovers an interplay among data density, noise, and the\ngeneralization ability in similarity learning. We consider Siamese Neural\nNetworks (SNNs), which are the basic form of contrastive learning, and explore\ntwo types of noise that can impact SNNs, Pair Label Noise (PLN) and Single\nLabel Noise (SLN). Our investigation reveals that SNNs exhibit double descent\nbehaviour regardless of the training setup and that it is further exacerbated\nby noise. We demonstrate that the density of data pairs is crucial for\ngeneralization. When SNNs are trained on sparse datasets with the same amount\nof PLN or SLN, they exhibit comparable generalization properties. However, when\nusing dense datasets, PLN cases generalize worse than SLN ones in the\noverparametrized region, leading to a phenomenon we call Density-Induced Break\nof Similarity (DIBS). In this regime, PLN similarity violation becomes\nmacroscopical, corrupting the dataset to the point where complete interpolation\ncannot be achieved, regardless of the number of model parameters. Our analysis\nalso delves into the correspondence between online optimization and offline\ngeneralization in similarity learning. The results show that this equivalence\nfails in the presence of label noise in all the scenarios considered.\n","authors":["Nayara Fonseca","Veronica Guidetti"],"pdf_url":"https://arxiv.org/pdf/2201.12803v3.pdf","comment":"v3: version accepted at ECAI 2023 + Supplementary Material"},{"id":"http://arxiv.org/abs/2307.12872v1","updated":"2023-07-24T15:10:22Z","published":"2023-07-24T15:10:22Z","title":"Data-free Black-box Attack based on Diffusion Model","summary":" Since the training data for the target model in a data-free black-box attack\nis not available, most recent schemes utilize GANs to generate data for\ntraining substitute model. 
However, these GANs-based schemes suffer from low\ntraining efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a data-free black-box attack scheme based\non the diffusion model to improve the efficiency and accuracy of substitute\ntraining. Although the data generated by the diffusion model exhibits high\nquality, it presents diverse domain distributions and contains many samples\nthat do not meet the discriminative criteria of the target model. To further\nfacilitate the diffusion model to generate data suitable for the target model,\nwe propose a Latent Code Augmentation (LCA) method to guide the diffusion model\nin generating data. With the guidance of LCA, the data generated by the\ndiffusion model not only meets the discriminative criteria of the target model\nbut also exhibits high diversity. By utilizing this data, it is possible to\ntrain a substitute model that closely resembles the target model more efficiently.\nExtensive experiments demonstrate that our LCA achieves higher attack success\nrates and requires fewer query budgets compared to GANs-based schemes for\ndifferent target models.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12868v1","updated":"2023-07-24T15:06:42Z","published":"2023-07-24T15:06:42Z","title":"Understanding the Latent Space of Diffusion Models through the Lens of\n Riemannian Geometry","summary":" Despite the success of diffusion models (DMs), we still lack a thorough\nunderstanding of their latent space. To understand the latent space\n$\\mathbf{x}_t \\in \\mathcal{X}$, we analyze them from a geometrical perspective.\nSpecifically, we utilize the pullback metric to find the local latent basis in\n$\\mathcal{X}$ and their corresponding local tangent basis in $\\mathcal{H}$, the\nintermediate feature maps of DMs. The discovered latent basis enables\nunsupervised image editing capability through latent space traversal. We\ninvestigate the discovered structure from two perspectives. First, we examine\nhow geometric structure evolves over diffusion timesteps. Through analysis, we\nshow that 1) the model focuses on low-frequency components early in the\ngenerative process and attunes to high-frequency details later; 2) At early\ntimesteps, different samples share similar tangent spaces; and 3) The simpler\nthe datasets that DMs are trained on, the more consistent the tangent space for each\ntimestep. Second, we investigate how the geometric structure changes based on\ntext conditioning in Stable Diffusion. The results show that 1) similar prompts\nyield comparable tangent spaces; and 2) the model depends less on text\nconditions in later timesteps. 
To the best of our knowledge, this paper is the\nfirst to present image editing through $\\mathbf{x}$-space traversal and provide\nthorough analyses of the latent structure of DMs.\n","authors":["Yong-Hyun Park","Mingi Kwon","Jaewoong Choi","Junghyo Jo","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2307.12868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09224v2","updated":"2023-07-24T15:05:55Z","published":"2023-06-15T16:03:01Z","title":"Encyclopedic VQA: Visual questions about detailed properties of\n fine-grained categories","summary":" We propose Encyclopedic-VQA, a large scale visual question answering (VQA)\ndataset featuring visual questions about detailed properties of fine-grained\ncategories and instances. It contains 221k unique question+answer pairs each\nmatched with (up to) 5 images, resulting in a total of 1M VQA samples.\nMoreover, our dataset comes with a controlled knowledge base derived from\nWikipedia, marking the evidence to support each answer. Empirically, we show\nthat our dataset poses a hard challenge for large vision+language models as\nthey perform poorly on our dataset: PaLI [14] is state-of-the-art on OK-VQA\n[37], yet it only achieves 13.0% accuracy on our dataset. Moreover, we\nexperimentally show that progress on answering our encyclopedic questions can\nbe achieved by augmenting large models with a mechanism that retrieves relevant\ninformation from the knowledge base. An oracle experiment with perfect\nretrieval achieves 87.0% accuracy on the single-hop portion of our dataset, and\nan automatic retrieval-augmented prototype yields 48.8%. We believe that our\ndataset enables future research on retrieval-augmented vision+language models.\nIt is available at\nhttps://github.com/google-research/google-research/tree/master/encyclopedic_vqa .\n","authors":["Thomas Mensink","Jasper Uijlings","Lluis Castrejon","Arushi Goel","Felipe Cadar","Howard Zhou","Fei Sha","André Araujo","Vittorio Ferrari"],"pdf_url":"https://arxiv.org/pdf/2306.09224v2.pdf","comment":"ICCV'23"},{"id":"http://arxiv.org/abs/2307.12858v1","updated":"2023-07-24T14:57:40Z","published":"2023-07-24T14:57:40Z","title":"Treatment Outcome Prediction for Intracerebral Hemorrhage via Generative\n Prognostic Model with Imaging and Tabular Data","summary":" Intracerebral hemorrhage (ICH) is the second most common and deadliest form\nof stroke. Despite medical advances, predicting treat ment outcomes for ICH\nremains a challenge. This paper proposes a novel prognostic model that utilizes\nboth imaging and tabular data to predict treatment outcome for ICH. Our model\nis trained on observational data collected from non-randomized controlled\ntrials, providing reliable predictions of treatment success. Specifically, we\npropose to employ a variational autoencoder model to generate a low-dimensional\nprognostic score, which can effectively address the selection bias resulting\nfrom the non-randomized controlled trials. Importantly, we develop a\nvariational distributions combination module that combines the information from\nimaging data, non-imaging clinical data, and treatment assignment to accurately\ngenerate the prognostic score. We conducted extensive experiments on a\nreal-world clinical dataset of intracerebral hemorrhage. Our proposed method\ndemonstrates a substantial improvement in treatment outcome prediction compared\nto existing state-of-the-art approaches. 
Code is available at\nhttps://github.com/med-air/TOP-GPM\n","authors":["Wenao Ma","Cheng Chen","Jill Abrigo","Calvin Hoi-Kwan Mak","Yuqi Gong","Nga Yan Chan","Chu Han","Zaiyi Liu","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2307.12858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12854v1","updated":"2023-07-24T14:55:15Z","published":"2023-07-24T14:55:15Z","title":"Multiscale Video Pretraining for Long-Term Activity Forecasting","summary":" Long-term activity forecasting is an especially challenging research problem\nbecause it requires understanding the temporal relationships between observed\nactions, as well as the variability and complexity of human activities. Despite\nrelying on strong supervision via expensive human annotations, state-of-the-art\nforecasting approaches often generalize poorly to unseen data. To alleviate\nthis issue, we propose Multiscale Video Pretraining (MVP), a novel\nself-supervised pretraining approach that learns robust representations for\nforecasting by learning to predict contextualized representations of future\nvideo clips over multiple timescales. MVP is based on our observation that\nactions in videos have a multiscale nature, where atomic actions typically\noccur at a short timescale and more complex actions may span longer timescales.\nWe compare MVP to state-of-the-art self-supervised video learning approaches on\ndownstream long-term forecasting tasks including long-term action anticipation\nand video summary prediction. Our comprehensive experiments across the Ego4D\nand Epic-Kitchens-55/100 datasets demonstrate that MVP out-performs\nstate-of-the-art methods by significant margins. Notably, MVP obtains a\nrelative performance gain of over 20% accuracy in video summary forecasting\nover existing methods.\n","authors":["Reuben Tan","Matthias De Lange","Michael Iuzzolino","Bryan A. Plummer","Kate Saenko","Karl Ridgeway","Lorenzo Torresani"],"pdf_url":"https://arxiv.org/pdf/2307.12854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11630v3","updated":"2023-07-24T14:53:51Z","published":"2023-03-21T06:54:18Z","title":"BoxSnake: Polygonal Instance Segmentation with Box Supervision","summary":" Box-supervised instance segmentation has gained much attention as it requires\nonly simple box annotations instead of costly mask or polygon annotations.\nHowever, existing box-supervised instance segmentation models mainly focus on\nmask-based frameworks. We propose a new end-to-end training technique, termed\nBoxSnake, to achieve effective polygonal instance segmentation using only box\nannotations for the first time. Our method consists of two loss functions: (1)\na point-based unary loss that constrains the bounding box of predicted polygons\nto achieve coarse-grained segmentation; and (2) a distance-aware pairwise loss\nthat encourages the predicted polygons to fit the object boundaries. Compared\nwith the mask-based weakly-supervised methods, BoxSnake further reduces the\nperformance gap between the predicted segmentation and the bounding box, and\nshows significant superiority on the Cityscapes dataset. 
The code has been\navailable publicly.\n","authors":["Rui Yang","Lin Song","Yixiao Ge","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2303.11630v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12853v1","updated":"2023-07-24T14:53:23Z","published":"2023-07-24T14:53:23Z","title":"Spatiotemporal Modeling Encounters 3D Medical Image Analysis:\n Slice-Shift UNet with Multi-View Fusion","summary":" As a fundamental part of computational healthcare, Computer Tomography (CT)\nand Magnetic Resonance Imaging (MRI) provide volumetric data, making the\ndevelopment of algorithms for 3D image analysis a necessity. Despite being\ncomputationally cheap, 2D Convolutional Neural Networks can only extract\nspatial information. In contrast, 3D CNNs can extract three-dimensional\nfeatures, but they have higher computational costs and latency, which is a\nlimitation for clinical practice that requires fast and efficient models.\nInspired by the field of video action recognition we propose a new 2D-based\nmodel dubbed Slice SHift UNet (SSH-UNet) which encodes three-dimensional\nfeatures at 2D CNN's complexity. More precisely multi-view features are\ncollaboratively learned by performing 2D convolutions along the three\northogonal planes of a volume and imposing a weights-sharing mechanism. The\nthird dimension, which is neglected by the 2D convolution, is reincorporated by\nshifting a portion of the feature maps along the slices' axis. The\neffectiveness of our approach is validated in Multi-Modality Abdominal\nMulti-Organ Segmentation (AMOS) and Multi-Atlas Labeling Beyond the Cranial\nVault (BTCV) datasets, showing that SSH-UNet is more efficient while on par in\nperformance with state-of-the-art architectures.\n","authors":["C. I. Ugwu","S. Casarin","O. Lanz"],"pdf_url":"https://arxiv.org/pdf/2307.12853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12845v1","updated":"2023-07-24T14:43:07Z","published":"2023-07-24T14:43:07Z","title":"Multi-View Vertebra Localization and Identification from CT Images","summary":" Accurately localizing and identifying vertebrae from CT images is crucial for\nvarious clinical applications. However, most existing efforts are performed on\n3D with cropping patch operation, suffering from the large computation costs\nand limited global information. In this paper, we propose a multi-view vertebra\nlocalization and identification from CT images, converting the 3D problem into\na 2D localization and identification task on different views. Without the\nlimitation of the 3D cropped patch, our method can learn the multi-view global\ninformation naturally. Moreover, to better capture the anatomical structure\ninformation from different view perspectives, a multi-view contrastive learning\nstrategy is developed to pre-train the backbone. Additionally, we further\npropose a Sequence Loss to maintain the sequential structure embedded along the\nvertebrae. Evaluation results demonstrate that, with only two 2D networks, our\nmethod can localize and identify vertebrae in CT images accurately, and\noutperforms the state-of-the-art methods consistently. 
Our code is available at\nhttps://github.com/ShanghaiTech-IMPACT/Multi-View-Vertebra-Localization-and-Identification-from-CT-Images.\n","authors":["Han Wu","Jiadong Zhang","Yu Fang","Zhentao Liu","Nizhuan Wang","Zhiming Cui","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2307.12845v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2306.15599v2","updated":"2023-07-24T14:41:40Z","published":"2023-06-27T16:37:37Z","title":"Coupling a Recurrent Neural Network to SPAD TCSPC Systems for Real-time\n Fluorescence Lifetime Imaging","summary":" Fluorescence lifetime imaging (FLI) has been receiving increased attention in\nrecent years as a powerful diagnostic technique in biological and medical\nresearch. However, existing FLI systems often suffer from a tradeoff between\nprocessing speed, accuracy, and robustness. In this paper, we propose a robust\napproach that enables fast FLI with no degradation of accuracy. The approach is\nbased on a SPAD TCSPC system coupled to a recurrent neural network (RNN) that\naccurately estimates the fluorescence lifetime directly from raw timestamps\nwithout building histograms, thereby drastically reducing transfer data volumes\nand hardware resource utilization, thus enabling FLI acquisition at video rate.\nWe train two variants of the RNN on a synthetic dataset and compare the results\nto those obtained using center-of-mass method (CMM) and least squares fitting\n(LS fitting). Results demonstrate that two RNN variants, gated recurrent unit\n(GRU) and long short-term memory (LSTM), are comparable to CMM and LS fitting\nin terms of accuracy, while outperforming them in background noise by a large\nmargin. To explore the ultimate limits of the approach, we derived the\nCramer-Rao lower bound of the measurement, showing that RNN yields lifetime\nestimations with near-optimal precision. Moreover, our FLI model, which is\npurely trained on synthetic datasets, works well with never-seen-before,\nreal-world data. To demonstrate real-time operation, we have built a FLI\nmicroscope based on Piccolo, a 32x32 SPAD sensor developed in our lab. Four\nquantized GRU cores, capable of processing up to 4 million photons per second,\nare deployed on a Xilinx Kintex-7 FPGA. Powered by the GRU, the FLI setup can\nretrieve real-time fluorescence lifetime images at up to 10 frames per second.\nThe proposed FLI system is promising and ideally suited for biomedical\napplications.\n","authors":["Yang Lin","Paul Mos","Andrei Ardelean","Claudio Bruschini","Edoardo Charbon"],"pdf_url":"https://arxiv.org/pdf/2306.15599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09696v2","updated":"2023-07-24T14:36:24Z","published":"2023-07-19T00:41:39Z","title":"Towards Saner Deep Image Registration","summary":" With recent advances in computing hardware and surges of deep-learning\narchitectures, learning-based deep image registration methods have surpassed\ntheir traditional counterparts, in terms of metric performance and inference\ntime. However, these methods focus on improving performance measurements such\nas Dice, resulting in less attention given to model behaviors that are equally\ndesirable for registrations, especially for medical imaging. This paper\ninvestigates these behaviors for popular learning-based deep registrations\nunder a sanity-checking microscope. We find that most existing registrations\nsuffer from low inverse consistency and nondiscrimination of identical pairs\ndue to overly optimized image similarities. 
To rectify these behaviors, we\npropose a novel regularization-based sanity-enforcer method that imposes two\nsanity checks on the deep model to reduce its inverse consistency errors and\nincrease its discriminative power simultaneously. Moreover, we derive a set of\ntheoretical guarantees for our sanity-checked image registration method, with\nexperimental results supporting our theoretical findings and their\neffectiveness in increasing the sanity of models without sacrificing any\nperformance. Our code and models are available at\nhttps://github.com/tuffr5/Saner-deep-registration.\n","authors":["Bin Duan","Ming Zhong","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2307.09696v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12837v1","updated":"2023-07-24T14:35:46Z","published":"2023-07-24T14:35:46Z","title":"EPIC-KITCHENS-100 Unsupervised Domain Adaptation Challenge: Mixed\n Sequences Prediction","summary":" This report presents the technical details of our approach for the\nEPIC-Kitchens-100 Unsupervised Domain Adaptation (UDA) Challenge in Action\nRecognition. Our approach is based on the idea that the order in which actions\nare performed is similar between the source and target domains. Based on this,\nwe generate a modified sequence by randomly combining actions from the source\nand target domains. As only unlabelled target data are available under the UDA\nsetting, we use a standard pseudo-labeling strategy for extracting action\nlabels for the target. We then ask the network to predict the resulting action\nsequence. This allows to integrate information from both domains during\ntraining and to achieve better transfer results on target. Additionally, to\nbetter incorporate sequence information, we use a language model to filter\nunlikely sequences. Lastly, we employed a co-occurrence matrix to eliminate\nunseen combinations of verbs and nouns. Our submission, labeled as 'sshayan',\ncan be found on the leaderboard, where it currently holds the 2nd position for\n'verb' and the 4th position for both 'noun' and 'action'.\n","authors":["Amirshayan Nasirimajd","Simone Alberto Peirone","Chiara Plizzari","Barbara Caputo"],"pdf_url":"https://arxiv.org/pdf/2307.12837v1.pdf","comment":"2nd place in the 2023 EPIC-KITCHENS-100 Unsupervised Domain\n Adaptation Challenge for Action Recognition"},{"id":"http://arxiv.org/abs/2307.12822v1","updated":"2023-07-24T14:19:36Z","published":"2023-07-24T14:19:36Z","title":"Learning Provably Robust Estimators for Inverse Problems via Jittering","summary":" Deep neural networks provide excellent performance for inverse problems such\nas denoising. However, neural networks can be sensitive to adversarial or\nworst-case perturbations. This raises the question of whether such networks can\nbe trained efficiently to be worst-case robust. In this paper, we investigate\nwhether jittering, a simple regularization technique that adds isotropic\nGaussian noise during training, is effective for learning worst-case robust\nestimators for inverse problems. While well studied for prediction in\nclassification tasks, the effectiveness of jittering for inverse problems has\nnot been systematically investigated. 
In this paper, we present a novel\nanalytical characterization of the optimal $\\ell_2$-worst-case robust estimator\nfor linear denoising and show that jittering yields optimal robust denoisers.\nFurthermore, we examine jittering empirically via training deep neural networks\n(U-nets) for natural image denoising, deconvolution, and accelerated magnetic\nresonance imaging (MRI). The results show that jittering significantly enhances\nthe worst-case robustness, but can be suboptimal for inverse problems beyond\ndenoising. Moreover, our results imply that training on real data which often\ncontains slight noise is somewhat robustness enhancing.\n","authors":["Anselm Krainovic","Mahdi Soltanolkotabi","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2307.12822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12813v1","updated":"2023-07-24T14:06:54Z","published":"2023-07-24T14:06:54Z","title":"Exposing the Troublemakers in Described Object Detection","summary":" Detecting objects based on language descriptions is a popular task that\nincludes Open-Vocabulary object Detection (OVD) and Referring Expression\nComprehension (REC). In this paper, we advance them to a more practical setting\ncalled Described Object Detection (DOD) by expanding category names to flexible\nlanguage expressions for OVD and overcoming the limitation of REC to only\ngrounding the pre-existing object. We establish the research foundation for DOD\ntasks by constructing a Description Detection Dataset ($D^3$), featuring\nflexible language expressions and annotating all described objects without\nomission. By evaluating previous SOTA methods on $D^3$, we find some\ntroublemakers that fail current REC, OVD, and bi-functional methods. REC\nmethods struggle with confidence scores, rejecting negative instances, and\nmulti-target scenarios, while OVD methods face constraints with long and\ncomplex descriptions. Recent bi-functional methods also do not work well on DOD\ndue to their separated training procedures and inference strategies for REC and\nOVD tasks. Building upon the aforementioned findings, we propose a baseline\nthat largely improves REC methods by reconstructing the training data and\nintroducing a binary classification sub-task, outperforming existing methods.\nData and code is available at https://github.com/shikras/d-cube.\n","authors":["Chi Xie","Zhao Zhang","Yixuan Wu","Feng Zhu","Rui Zhao","Shuang Liang"],"pdf_url":"https://arxiv.org/pdf/2307.12813v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2307.02148v2","updated":"2023-07-24T13:59:50Z","published":"2023-07-05T09:44:02Z","title":"Compound Attention and Neighbor Matching Network for Multi-contrast MRI\n Super-resolution","summary":" Multi-contrast magnetic resonance imaging (MRI) reflects information about\nhuman tissue from different perspectives and has many clinical applications. By\nutilizing the complementary information among different modalities,\nmulti-contrast super-resolution (SR) of MRI can achieve better results than\nsingle-image super-resolution. However, existing methods of multi-contrast MRI\nSR have the following shortcomings that may limit their performance: First,\nexisting methods either simply concatenate the reference and degraded features\nor exploit global feature-matching between them, which are unsuitable for\nmulti-contrast MRI SR. 
Second, although many recent methods employ transformers\nto capture long-range dependencies in the spatial dimension, they neglect that\nself-attention in the channel dimension is also important for low-level vision\ntasks. To address these shortcomings, we proposed a novel network architecture\nwith compound-attention and neighbor matching (CANM-Net) for multi-contrast MRI\nSR: The compound self-attention mechanism effectively captures the dependencies\nin both spatial and channel dimension; the neighborhood-based feature-matching\nmodules are exploited to match degraded features and adjacent reference\nfeatures and then fuse them to obtain the high-quality images. We conduct\nexperiments of SR tasks on the IXI, fastMRI, and real-world scanning datasets.\nThe CANM-Net outperforms state-of-the-art approaches in both retrospective and\nprospective experiments. Moreover, the robustness study in our work shows that\nthe CANM-Net still achieves good performance when the reference and degraded\nimages are imperfectly registered, proving good potential in clinical\napplications.\n","authors":["Wenxuan Chen","Sirui Wu","Shuai Wang","Zhongsen Li","Jia Yang","Huifeng Yao","Xiaomeng Li","Xiaolei Song"],"pdf_url":"https://arxiv.org/pdf/2307.02148v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2211.16761v3","updated":"2023-07-24T13:53:26Z","published":"2022-11-30T05:59:23Z","title":"Improving Cross-Modal Retrieval with Set of Diverse Embeddings","summary":" Cross-modal retrieval across image and text modalities is a challenging task\ndue to its inherent ambiguity: An image often exhibits various situations, and\na caption can be coupled with diverse images. Set-based embedding has been\nstudied as a solution to this problem. It seeks to encode a sample into a set\nof different embedding vectors that capture different semantics of the sample.\nIn this paper, we present a novel set-based embedding method, which is distinct\nfrom previous work in two aspects. First, we present a new similarity function\ncalled smooth-Chamfer similarity, which is designed to alleviate the side\neffects of existing similarity functions for set-based embedding. Second, we\npropose a novel set prediction module to produce a set of embedding vectors\nthat effectively captures diverse semantics of input by the slot attention\nmechanism. Our method is evaluated on the COCO and Flickr30K datasets across\ndifferent visual backbones, where it outperforms existing methods including\nones that demand substantially larger computation at inference.\n","authors":["Dongwon Kim","Namyup Kim","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2211.16761v3.pdf","comment":"Accepted to CVPR 2023 (Highlight)"},{"id":"http://arxiv.org/abs/2307.12790v1","updated":"2023-07-24T13:39:21Z","published":"2023-07-24T13:39:21Z","title":"Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution\n for Medical Image Classification","summary":" Graph-based neural network models are gaining traction in the field of\nrepresentation learning due to their ability to uncover latent topological\nrelationships between entities that are otherwise challenging to identify.\nThese models have been employed across a diverse range of domains, encompassing\ndrug discovery, protein interactions, semantic segmentation, and fluid dynamics\nresearch. 
In this study, we investigate the potential of Graph Neural Networks\n(GNNs) for medical image classification. We introduce a novel model that\ncombines GNNs and edge convolution, leveraging the interconnectedness of RGB\nchannel feature values to strongly represent connections between crucial graph\nnodes. Our proposed model not only performs on par with state-of-the-art Deep\nNeural Networks (DNNs) but does so with 1000 times fewer parameters, resulting\nin reduced training time and data requirements. We compare our Graph\nConvolutional Neural Network (GCNN) to pre-trained DNNs for classifying\nMedMNIST dataset classes, revealing promising prospects for GNNs in medical\nimage analysis. Our results also encourage further exploration of advanced\ngraph-based models such as Graph Attention Networks (GAT) and Graph\nAuto-Encoders in the medical imaging domain. The proposed model yields more\nreliable, interpretable, and accurate outcomes for tasks like semantic\nsegmentation and image classification compared to simpler GCNNs\n","authors":["Aryan Singh","Pepijn Van de Ven","Ciarán Eising","Patrick Denny"],"pdf_url":"https://arxiv.org/pdf/2307.12790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13170v4","updated":"2023-07-24T13:35:28Z","published":"2022-04-27T20:04:24Z","title":"AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias\n Estimation","summary":" In Federated Learning (FL), a number of clients or devices collaborate to\ntrain a model without sharing their data. Models are optimized locally at each\nclient and further communicated to a central hub for aggregation. While FL is\nan appealing decentralized training paradigm, heterogeneity among data from\ndifferent clients can cause the local optimization to drift away from the\nglobal objective. In order to estimate and therefore remove this drift,\nvariance reduction techniques have been incorporated into FL optimization\nrecently. However, these approaches inaccurately estimate the clients' drift\nand ultimately fail to remove it properly. In this work, we propose an adaptive\nalgorithm that accurately estimates drift across clients. In comparison to\nprevious works, our approach necessitates less storage and communication\nbandwidth, as well as lower compute costs. Additionally, our proposed\nmethodology induces stability by constraining the norm of estimates for client\ndrift, making it more practical for large scale FL. Experimental findings\ndemonstrate that the proposed algorithm converges significantly faster and\nachieves higher accuracy than the baselines across various FL benchmarks.\n","authors":["Farshid Varno","Marzie Saghayi","Laya Rafiee Sevyeri","Sharut Gupta","Stan Matwin","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2204.13170v4.pdf","comment":"Published as a conference paper at ECCV 2022; Corrected some typos in\n the text and a baseline algorithm"},{"id":"http://arxiv.org/abs/2303.12540v2","updated":"2023-07-24T13:35:16Z","published":"2023-03-22T13:16:37Z","title":"Deployment of Image Analysis Algorithms under Prevalence Shifts","summary":" Domain gaps are among the most relevant roadblocks in the clinical\ntranslation of machine learning (ML)-based solutions for medical image\nanalysis. While current research focuses on new training paradigms and network\narchitectures, little attention is given to the specific effect of prevalence\nshifts on an algorithm deployed in practice. 
Such discrepancies between class\nfrequencies in the data used for a method's development/validation and that in\nits deployment environment(s) are of great importance, for example in the\ncontext of artificial intelligence (AI) democratization, as disease prevalences\nmay vary widely across time and location. Our contribution is twofold. First,\nwe empirically demonstrate the potentially severe consequences of missing\nprevalence handling by analyzing (i) the extent of miscalibration, (ii) the\ndeviation of the decision threshold from the optimum, and (iii) the ability of\nvalidation metrics to reflect neural network performance on the deployment\npopulation as a function of the discrepancy between development and deployment\nprevalence. Second, we propose a workflow for prevalence-aware image\nclassification that uses estimated deployment prevalences to adjust a trained\nclassifier to a new environment, without requiring additional annotated\ndeployment data. Comprehensive experiments based on a diverse set of 30 medical\nclassification tasks showcase the benefit of the proposed workflow in\ngenerating better classifier decisions and more reliable performance estimates\ncompared to current practice.\n","authors":["Patrick Godau","Piotr Kalinowski","Evangelia Christodoulou","Annika Reinke","Minu Tizabi","Luciana Ferrer","Paul Jäger","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.12540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12775v1","updated":"2023-07-24T13:24:56Z","published":"2023-07-24T13:24:56Z","title":"Is attention all you need in medical image analysis? A review","summary":" Medical imaging is a key component in clinical diagnosis, treatment planning\nand clinical trial design, accounting for almost 90% of all healthcare data.\nCNNs achieved performance gains in medical image analysis (MIA) over the last\nyears. CNNs can efficiently model local pixel interactions and be trained on\nsmall-scale MI data. The main disadvantage of typical CNN models is that they\nignore global pixel relationships within images, which limits their\ngeneralisation ability to understand out-of-distribution data with different\n'global' information. The recent progress of Artificial Intelligence gave rise\nto Transformers, which can learn global relationships from data. However, full\nTransformer models need to be trained on large-scale data and involve\ntremendous computational complexity. Attention and Transformer compartments\n(Transf/Attention) which can well maintain properties for modelling global\nrelationships, have been proposed as lighter alternatives of full Transformers.\nRecently, there is an increasing trend to co-pollinate complementary\nlocal-global properties from CNN and Transf/Attention architectures, which led\nto a new era of hybrid models. The past years have witnessed substantial growth\nin hybrid CNN-Transf/Attention models across diverse MIA problems. In this\nsystematic review, we survey existing hybrid CNN-Transf/Attention models,\nreview and unravel key architectural designs, analyse breakthroughs, and\nevaluate current and future opportunities as well as challenges. 
We also\nintroduced a comprehensive analysis framework on generalisation opportunities\nof scientific and clinical impact, based on which new data-driven domain\ngeneralisation and adaptation methods can be stimulated.\n","authors":["Giorgos Papanastasiou","Nikolaos Dikaios","Jiahao Huang","Chengjia Wang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12774v1","updated":"2023-07-24T13:24:19Z","published":"2023-07-24T13:24:19Z","title":"Fast Full-frame Video Stabilization with Iterative Optimization","summary":" Video stabilization refers to the problem of transforming a shaky video into\na visually pleasing one. The question of how to strike a good trade-off between\nvisual quality and computational speed has remained one of the open challenges\nin video stabilization. Inspired by the analogy between wobbly frames and\njigsaw puzzles, we propose an iterative optimization-based learning approach\nusing synthetic datasets for video stabilization, which consists of two\ninteracting submodules: motion trajectory smoothing and full-frame outpainting.\nFirst, we develop a two-level (coarse-to-fine) stabilizing algorithm based on\nthe probabilistic flow field. The confidence map associated with the estimated\noptical flow is exploited to guide the search for shared regions through\nbackpropagation. Second, we take a divide-and-conquer approach and propose a\nnovel multiframe fusion strategy to render full-frame stabilized views. An\nimportant new insight brought about by our iterative optimization approach is\nthat the target video can be interpreted as the fixed point of nonlinear\nmapping for video stabilization. We formulate video stabilization as a problem\nof minimizing the amount of jerkiness in motion trajectories, which guarantees\nconvergence with the help of fixed-point theory. Extensive experimental results\nare reported to demonstrate the superiority of the proposed approach in terms\nof computational speed and visual quality. The code will be available on\nGitHub.\n","authors":["Weiyue Zhao","Xin Li","Zhan Peng","Xianrui Luo","Xinyi Ye","Hao Lu","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12774v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.12761v1","updated":"2023-07-24T13:05:36Z","published":"2023-07-24T13:05:36Z","title":"LiDAR Meta Depth Completion","summary":" Depth estimation is one of the essential tasks to be addressed when creating\nmobile autonomous systems. While monocular depth estimation methods have\nimproved in recent times, depth completion provides more accurate and reliable\ndepth maps by additionally using sparse depth information from other sensors\nsuch as LiDAR. However, current methods are specifically trained for a single\nLiDAR sensor. As the scanning pattern differs between sensors, every new sensor\nwould require re-training a specialized depth completion model, which is\ncomputationally inefficient and not flexible. Therefore, we propose to\ndynamically adapt the depth completion model to the used sensor type enabling\nLiDAR adaptive depth completion. Specifically, we propose a meta depth\ncompletion network that uses data patterns derived from the data to learn a\ntask network to alter weights of the main depth completion network to solve a\ngiven depth completion task effectively. The method demonstrates a strong\ncapability to work on multiple LiDAR scanning patterns and can also generalize\nto scanning patterns that are unseen during training. 
While using a single\nmodel, our method yields significantly better results than a non-adaptive\nbaseline trained on different LiDAR patterns. It outperforms LiDAR-specific\nexpert models for very sparse cases. These advantages allow flexible deployment\nof a single depth completion model on different sensors, which could also prove\nvaluable to process the input of nascent LiDAR technology with adaptive instead\nof fixed scanning patterns.\n","authors":["Wolfgang Boettcher","Lukas Hoyer","Ozan Unal","Dengxin Dai"],"pdf_url":"https://arxiv.org/pdf/2307.12761v1.pdf","comment":"Accepted at IROS 2023"},{"id":"http://arxiv.org/abs/2209.11531v2","updated":"2023-07-24T13:04:48Z","published":"2022-09-23T11:36:32Z","title":"Deep Learning-based Anonymization of Chest Radiographs: A\n Utility-preserving Measure for Patient Privacy","summary":" Robust and reliable anonymization of chest radiographs constitutes an\nessential step before publishing large datasets of such for research purposes.\nThe conventional anonymization process is carried out by obscuring personal\ninformation in the images with black boxes and removing or replacing\nmeta-information. However, such simple measures retain biometric information in\nthe chest radiographs, allowing patients to be re-identified by a linkage\nattack. Therefore, there is an urgent need to obfuscate the biometric\ninformation appearing in the images. We propose the first deep learning-based\napproach (PriCheXy-Net) to targetedly anonymize chest radiographs while\nmaintaining data utility for diagnostic and machine learning purposes. Our\nmodel architecture is a composition of three independent neural networks that,\nwhen collectively used, allow for learning a deformation field that is able to\nimpede patient re-identification. Quantitative results on the ChestX-ray14\ndataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC)\nafter re-training with little impact on the abnormality classification\nperformance. This indicates the ability to preserve underlying abnormality\npatterns while increasing patient privacy. Lastly, we compare our proposed\nanonymization approach with two other obfuscation-based methods (Privacy-Net,\nDP-Pix) and demonstrate the superiority of our method towards resolving the\nprivacy-utility trade-off for chest radiographs.\n","authors":["Kai Packhäuser","Sebastian Gündel","Florian Thamm","Felix Denzinger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2209.11531v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.07620v2","updated":"2023-07-24T13:03:17Z","published":"2023-07-14T20:39:07Z","title":"Generalizable Embeddings with Cross-batch Metric Learning","summary":" Global average pooling (GAP) is a popular component in deep metric learning\n(DML) for aggregating features. Its effectiveness is often attributed to\ntreating each feature vector as a distinct semantic entity and GAP as a\ncombination of them. Albeit substantiated, such an explanation's algorithmic\nimplications to learn generalizable entities to represent unseen classes, a\ncrucial DML goal, remain unclear. To address this, we formulate GAP as a convex\ncombination of learnable prototypes. We then show that the prototype learning\ncan be expressed as a recursive process fitting a linear predictor to a batch\nof samples. 
Building on that perspective, we consider two batches of disjoint\nclasses at each iteration and regularize the learning by expressing the samples\nof a batch with the prototypes that are fitted to the other batch. We validate\nour approach on 4 popular DML benchmarks.\n","authors":["Yeti Z. Gurbuz","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2307.07620v2.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2307.12751v1","updated":"2023-07-24T12:42:45Z","published":"2023-07-24T12:42:45Z","title":"ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised\n Real-world Single Image Super-Resolution","summary":" Single image super-resolution (SISR) is a challenging ill-posed problem that\naims to up-sample a given low-resolution (LR) image to a high-resolution (HR)\ncounterpart. Due to the difficulty in obtaining real LR-HR training pairs,\nrecent approaches are trained on simulated LR images degraded by simplified\ndown-sampling operators, e.g., bicubic. Such an approach can be problematic in\npractice because of the large gap between the synthesized and real-world LR\nimages. To alleviate the issue, we propose a novel Invertible scale-Conditional\nFunction (ICF), which can scale an input image and then restore the original\ninput with different scale conditions. By leveraging the proposed ICF, we\nconstruct a novel self-supervised SISR framework (ICF-SRSR) to handle the\nreal-world SR task without using any paired/unpaired training data.\nFurthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs,\nwhich can make existing supervised SISR networks more robust. Extensive\nexperiments demonstrate the effectiveness of the proposed method in handling\nSISR in a fully self-supervised manner. Our ICF-SRSR demonstrates superior\nperformance compared to the existing methods trained on synthetic paired images\nin real-world scenarios and exhibits comparable performance compared to\nstate-of-the-art supervised/unsupervised methods on public benchmark datasets.\n","authors":["Reyhaneh Neshatavar","Mohsen Yavartanoo","Sanghyun Son","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09629v2","updated":"2023-07-24T12:33:09Z","published":"2023-02-19T17:15:56Z","title":"BiofilmScanner: A Computational Intelligence Approach to Obtain\n Bacterial Cell Morphological Attributes from Biofilm Image","summary":" Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for\nsulfate-reducing bacteria (SRB) that are associated with corrosion issues\ncaused by microorganisms. SRB-based biofilms are thought to be responsible for\nthe billion-dollar-per-year bio-corrosion of metal infrastructure.\nUnderstanding the extraction of the bacterial cells' shape and size properties\nin the SRB-biofilm at different growth stages will assist with the design of\nanti-corrosion techniques. However, numerous issues affect current approaches,\nincluding time-consuming geometric property extraction, low efficiency, and\nhigh error rates. 
This paper proposes BiofilmScanner, a Yolact-based deep\nlearning method integrated with invariant moments to address these problems.\nOur approach efficiently detects and segments bacterial cells in an SRB image\nwhile invariant moments simultaneously measure the geometric characteristics of\nthe segmented cells with low errors. The numerical experiments of the proposed\nmethod demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our\nearlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring\nthe geometric properties of the cell. Furthermore, the BiofilmScanner achieved\nan F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67%\nand 75.18%, respectively.\n","authors":["Md Hafizur Rahman","Md Ali Azam","Md Abir Hossen","Shankarachary Ragi","Venkataramana Gadhamshetty"],"pdf_url":"https://arxiv.org/pdf/2302.09629v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2307.12732v1","updated":"2023-07-24T12:24:07Z","published":"2023-07-24T12:24:07Z","title":"CLIP-KD: An Empirical Study of Distilling CLIP Models","summary":" CLIP has become a promising language-supervised visual pre-training framework\nand achieves excellent performance over a wide range of tasks. This paper aims\nto distill small CLIP models supervised by a large teacher CLIP model. We\npropose several distillation strategies, including relation, feature, gradient\nand contrastive paradigm, to examine the impact on CLIP distillation. We show\nthat the simplest feature mimicry with MSE loss performs best. Moreover,\ninteractive contrastive learning and relation-based distillation are also\ncritical in performance improvement. We apply the unified method to distill\nseveral student networks trained on 15 million (image, text) pairs.\nDistillation improves the student CLIP models consistently over zero-shot\nImageNet classification and cross-modal retrieval benchmarks. We hope our\nempirical study will become an important baseline for future CLIP distillation\nresearch. The code is available at \\url{https://github.com/winycg/CLIP-KD}.\n","authors":["Chuanguang Yang","Zhulin An","Libo Huang","Junyu Bi","Xinqiang Yu","Han Yang","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12730v1","updated":"2023-07-24T12:22:19Z","published":"2023-07-24T12:22:19Z","title":"COCO-O: A Benchmark for Object Detectors under Natural Distribution\n Shifts","summary":" Practical object detection applications can lose their effectiveness on image\ninputs with natural distribution shifts. This problem leads the research\ncommunity to pay more attention to the robustness of detectors under\nOut-Of-Distribution (OOD) inputs. Existing works construct datasets to\nbenchmark the detector's OOD robustness for a specific application scenario,\ne.g., Autonomous Driving. However, these datasets lack universality and make it\nhard to benchmark general detectors built on common tasks such as COCO. To give\na more comprehensive robustness assessment, we introduce\nCOCO-O(ut-of-distribution), a test dataset based on COCO with 6 types of\nnatural distribution shifts. COCO-O has a large distribution gap with training\ndata and results in a significant 55.7% relative performance drop on a Faster\nR-CNN detector. We leverage COCO-O to conduct experiments on more than 100\nmodern object detectors to investigate if their improvements are credible or\njust over-fitting to the COCO test set. 
Unfortunately, most classic detectors\nin early years do not exhibit strong OOD generalization. We further study the\nrobustness effect on recent breakthroughs of detector's architecture design,\naugmentation and pre-training techniques. Some empirical findings are revealed:\n1) Compared with detection head or neck, backbone is the most important part\nfor robustness; 2) An end-to-end detection transformer design brings no\nenhancement, and may even reduce robustness; 3) Large-scale foundation models\nhave made a great leap on robust object detection. We hope our COCO-O could\nprovide a rich testbed for robustness study of object detection. The dataset\nwill be available at\n\\url{https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o}.\n","authors":["Xiaofeng Mao","Yuefeng Chen","Yao Zhu","Da Chen","Hang Su","Rong Zhang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2307.12730v1.pdf","comment":"To appear in ICCV2023,\n https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o"},{"id":"http://arxiv.org/abs/2307.12729v1","updated":"2023-07-24T12:21:33Z","published":"2023-07-24T12:21:33Z","title":"Persistent-Transient Duality: A Multi-mechanism Approach for Modeling\n Human-Object Interaction","summary":" Humans are highly adaptable, swiftly switching between different modes to\nprogressively handle different tasks, situations and contexts. In Human-object\ninteraction (HOI) activities, these modes can be attributed to two mechanisms:\n(1) the large-scale consistent plan for the whole activity and (2) the\nsmall-scale children interactive actions that start and end along the timeline.\nWhile neuroscience and cognitive science have confirmed this multi-mechanism\nnature of human behavior, machine modeling approaches for human motion are\ntrailing behind. While attempted to use gradually morphing structures (e.g.,\ngraph attention networks) to model the dynamic HOI patterns, they miss the\nexpeditious and discrete mode-switching nature of the human motion. To bridge\nthat gap, this work proposes to model two concurrent mechanisms that jointly\ncontrol human motion: the Persistent process that runs continually on the\nglobal scale, and the Transient sub-processes that operate intermittently on\nthe local context of the human while interacting with objects. These two\nmechanisms form an interactive Persistent-Transient Duality that\nsynergistically governs the activity sequences. We model this conceptual\nduality by a parent-child neural network of Persistent and Transient channels\nwith a dedicated neural module for dynamic mechanism switching. The framework\nis trialed on HOI motion forecasting. On two rich datasets and a wide variety\nof settings, the model consistently delivers superior performances, proving its\nsuitability for the challenge.\n","authors":["Hung Tran","Vuong Le","Svetha Venkatesh","Truyen Tran"],"pdf_url":"https://arxiv.org/pdf/2307.12729v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2303.12865v3","updated":"2023-07-24T12:08:50Z","published":"2023-03-22T18:59:48Z","title":"NeRF-GAN Distillation for Efficient 3D-Aware Generation with\n Convolutions","summary":" Pose-conditioned convolutional generative models struggle with high-quality\n3D-consistent image generation from single-view datasets, due to their lack of\nsufficient 3D priors. Recently, the integration of Neural Radiance Fields\n(NeRFs) and generative models, such as Generative Adversarial Networks (GANs),\nhas transformed 3D-aware generation from single-view images. 
NeRF-GANs exploit\nthe strong inductive bias of neural 3D representations and volumetric rendering\nat the cost of higher computational complexity. This study aims at revisiting\npose-conditioned 2D GANs for efficient 3D-aware generation at inference time by\ndistilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and\neffective method, based on re-using the well-disentangled latent space of a\npre-trained NeRF-GAN in a pose-conditioned convolutional network to directly\ngenerate 3D-consistent images corresponding to the underlying 3D\nrepresentations. Experiments on several datasets demonstrate that the proposed\nmethod obtains results comparable with volumetric rendering in terms of quality\nand 3D consistency while benefiting from the computational advantage of\nconvolutional networks. The code will be available at:\nhttps://github.com/mshahbazi72/NeRF-GAN-Distillation\n","authors":["Mohamad Shahbazi","Evangelos Ntavelis","Alessio Tonioni","Edo Collins","Danda Pani Paudel","Martin Danelljan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12721v1","updated":"2023-07-24T12:03:50Z","published":"2023-07-24T12:03:50Z","title":"AMAE: Adaptation of Pre-Trained Masked Autoencoder for Dual-Distribution\n Anomaly Detection in Chest X-Rays","summary":" Unsupervised anomaly detection in medical images such as chest radiographs is\nstepping into the spotlight as it mitigates the scarcity of the labor-intensive\nand costly expert annotation of anomaly data. However, nearly all existing\nmethods are formulated as a one-class classification trained only on\nrepresentations from the normal class and discard a potentially significant\nportion of the unlabeled data. This paper focuses on a more practical setting,\ndual distribution anomaly detection for chest X-rays, using the entire training\ndata, including both normal and unlabeled images. Inspired by a modern\nself-supervised vision transformer model trained using partial image inputs to\nreconstruct missing image regions -- we propose AMAE, a two-stage algorithm for\nadaptation of the pre-trained masked autoencoder (MAE). Starting from MAE\ninitialization, AMAE first creates synthetic anomalies from only normal\ntraining images and trains a lightweight classifier on frozen transformer\nfeatures. Subsequently, we propose an adaptation strategy to leverage unlabeled\nimages containing anomalies. The adaptation scheme is accomplished by assigning\npseudo-labels to unlabeled images and using two separate MAE based modules to\nmodel the normative and anomalous distributions of pseudo-labeled images. The\neffectiveness of the proposed adaptation strategy is evaluated with different\nanomaly ratios in an unlabeled training set. 
AMAE leads to consistent\nperformance gains over competing self-supervised and dual distribution anomaly\ndetection methods, setting the new state-of-the-art on three public chest X-ray\nbenchmarks: RSNA, NIH-CXR, and VinDr-CXR.\n","authors":["Behzad Bozorgtabar","Dwarikanath Mahapatra","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2307.12721v1.pdf","comment":"To be presented at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.12718v1","updated":"2023-07-24T11:59:07Z","published":"2023-07-24T11:59:07Z","title":"CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle\n Components","summary":" Neural Radiance Fields (NeRFs) have gained widespread recognition as a highly\neffective technique for representing 3D reconstructions of objects and scenes\nderived from sets of images. Despite their efficiency, NeRF models can pose\nchallenges in certain scenarios such as vehicle inspection, where the lack of\nsufficient data or the presence of challenging elements (e.g. reflections)\nstrongly impact the accuracy of the reconstruction. To this aim, we introduce\nCarPatch, a novel synthetic benchmark of vehicles. In addition to a set of\nimages annotated with their intrinsic and extrinsic camera parameters, the\ncorresponding depth maps and semantic segmentation masks have been generated\nfor each view. Global and part-based metrics have been defined and used to\nevaluate, compare, and better characterize some state-of-the-art techniques.\nThe dataset is publicly released at\nhttps://aimagelab.ing.unimore.it/go/carpatch and can be used as an evaluation\nguide and as a baseline for future work on this challenging topic.\n","authors":["Davide Di Nucci","Alessandro Simoni","Matteo Tomei","Luca Ciuffreda","Roberto Vezzani","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2307.12718v1.pdf","comment":"Accepted at ICIAP2023"},{"id":"http://arxiv.org/abs/2307.12717v1","updated":"2023-07-24T11:58:58Z","published":"2023-07-24T11:58:58Z","title":"Dense Transformer based Enhanced Coding Network for Unsupervised Metal\n Artifact Reduction","summary":" CT images corrupted by metal artifacts have serious negative effects on\nclinical diagnosis. Considering the difficulty of collecting paired data with\nground truth in clinical settings, unsupervised methods for metal artifact\nreduction are of high interest. However, it is difficult for previous\nunsupervised methods to retain structural information from CT images while\nhandling the non-local characteristics of metal artifacts. To address these\nchallenges, we proposed a novel Dense Transformer based Enhanced Coding Network\n(DTEC-Net) for unsupervised metal artifact reduction. Specifically, we\nintroduce a Hierarchical Disentangling Encoder, supported by the high-order\ndense process, and transformer to obtain densely encoded sequences with\nlong-range correspondence. Then, we present a second-order disentanglement\nmethod to improve the dense sequence's decoding process. Extensive experiments\nand model discussions illustrate DTEC-Net's effectiveness, which outperforms\nthe previous state-of-the-art methods on a benchmark dataset, and greatly\nreduces metal artifacts while restoring richer texture details.\n","authors":["Wangduo Xie","Matthew B. 
Blaschko"],"pdf_url":"https://arxiv.org/pdf/2307.12717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09340v3","updated":"2023-07-24T11:34:21Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v3.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2011.09094v3","updated":"2023-07-24T11:28:46Z","published":"2020-11-18T05:16:11Z","title":"UP-DETR: Unsupervised Pre-training for Object Detection with\n Transformers","summary":" DEtection TRansformer (DETR) for object detection reaches competitive\nperformance compared with Faster R-CNN via a transformer encoder-decoder\narchitecture. However, trained with scratch transformers, DETR needs\nlarge-scale training data and an extreme long training schedule even on COCO\ndataset. Inspired by the great success of pre-training transformers in natural\nlanguage processing, we propose a novel pretext task named random query patch\ndetection in Unsupervised Pre-training DETR (UP-DETR). Specifically, we\nrandomly crop patches from the given image and then feed them as queries to the\ndecoder. The model is pre-trained to detect these query patches from the input\nimage. During the pre-training, we address two critical issues: multi-task\nlearning and multi-query localization. 
(1) To trade off classification and\nlocalization preferences in the pretext task, we find that freezing the CNN\nbackbone is the prerequisite for the success of pre-training transformers. (2)\nTo perform multi-query localization, we develop UP-DETR with multi-query patch\ndetection with attention mask. Besides, UP-DETR also provides a unified\nperspective for fine-tuning object detection and one-shot detection tasks. In\nour experiments, UP-DETR significantly boosts the performance of DETR with\nfaster convergence and higher average precision on object detection, one-shot\ndetection and panoptic segmentation. Code and pre-training models:\nhttps://github.com/dddzg/up-detr.\n","authors":["Zhigang Dai","Bolun Cai","Yugeng Lin","Junying Chen"],"pdf_url":"https://arxiv.org/pdf/2011.09094v3.pdf","comment":"Accepted by TPAMI 2022 and CVPR 2021"},{"id":"http://arxiv.org/abs/2307.12698v1","updated":"2023-07-24T11:27:14Z","published":"2023-07-24T11:27:14Z","title":"MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised\n Learning of Motion and Content Features","summary":" Self-supervised learning of visual representations has been focusing on\nlearning content features, which do not capture object motion or location, and\nfocus on identifying and differentiating objects in images and videos. On the\nother hand, optical flow estimation is a task that does not involve\nunderstanding the content of the images on which it is estimated. We unify the\ntwo approaches and introduce MC-JEPA, a joint-embedding predictive architecture\nand self-supervised learning approach to jointly learn optical flow and content\nfeatures within a shared encoder, demonstrating that the two associated\nobjectives; the optical flow estimation objective and the self-supervised\nlearning objective; benefit from each other and thus learn content features\nthat incorporate motion information. The proposed approach achieves performance\non-par with existing unsupervised optical flow benchmarks, as well as with\ncommon self-supervised learning approaches on downstream tasks such as semantic\nsegmentation of images and videos.\n","authors":["Adrien Bardes","Jean Ponce","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2307.12698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10763v3","updated":"2023-07-24T11:15:47Z","published":"2023-02-12T12:19:57Z","title":"Contrastive Learning and the Emergence of Attributes Associations","summary":" In response to an object presentation, supervised learning schemes generally\nrespond with a parsimonious label. Upon a similar presentation we humans\nrespond again with a label, but are flooded, in addition, by a myriad of\nassociations. A significant portion of these consist of the presented object\nattributes. Contrastive learning is a semi-supervised learning scheme based on\nthe application of identity preserving transformations on the object input\nrepresentations. It is conjectured in this work that these same applied\ntransformations preserve, in addition to the identity of the presented object,\nalso the identity of its semantically meaningful attributes. The corollary of\nthis is that the output representations of such a contrastive learning scheme\ncontain valuable information not only for the classification of the presented\nobject, but also for the presence or absence decision of any attribute of\ninterest. Simulation results which demonstrate this idea and the feasibility of\nthis conjecture are presented.\n","authors":["Daniel N. 
Nissani"],"pdf_url":"https://arxiv.org/pdf/2302.10763v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2304.02941v2","updated":"2023-07-24T10:57:15Z","published":"2023-04-06T08:56:18Z","title":"Dr. KID: Direct Remeshing and K-set Isometric Decomposition for Scalable\n Physicalization of Organic Shapes","summary":" Dr. KID is an algorithm that uses isometric decomposition for the\nphysicalization of potato-shaped organic models in a puzzle fashion. The\nalgorithm begins with creating a simple, regular triangular surface mesh of\norganic shapes, followed by iterative k-means clustering and remeshing. For\nclustering, we need similarity between triangles (segments) which is defined as\na distance function. The distance function maps each triangle's shape to a\nsingle point in the virtual 3D space. Thus, the distance between the triangles\nindicates their degree of dissimilarity. K-means clustering uses this distance\nand sorts of segments into k classes. After this, remeshing is applied to\nminimize the distance between triangles within the same cluster by making their\nshapes identical. Clustering and remeshing are repeated until the distance\nbetween triangles in the same cluster reaches an acceptable threshold. We adopt\na curvature-aware strategy to determine the surface thickness and finalize\npuzzle pieces for 3D printing. Identical hinges and holes are created for\nassembling the puzzle components. For smoother outcomes, we use triangle\nsubdivision along with curvature-aware clustering, generating curved triangular\npatches for 3D printing. Our algorithm was evaluated using various models, and\nthe 3D-printed results were analyzed. Findings indicate that our algorithm\nperforms reliably on target organic shapes with minimal loss of input geometry.\n","authors":["Dawar Khan","Ciril Bohak","Ivan Viola"],"pdf_url":"https://arxiv.org/pdf/2304.02941v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12676v1","updated":"2023-07-24T10:30:54Z","published":"2023-07-24T10:30:54Z","title":"Damage Vision Mining Opportunity for Imbalanced Anomaly Detection","summary":" In past decade, previous balanced datasets have been used to advance\nalgorithms for classification, object detection, semantic segmentation, and\nanomaly detection in industrial applications. Specifically, for condition-based\nmaintenance, automating visual inspection is crucial to ensure high quality.\nDeterioration prognostic attempts to optimize the fine decision process for\npredictive maintenance and proactive repair. In civil infrastructure and living\nenvironment, damage data mining cannot avoid the imbalanced data issue because\nof rare unseen events and high quality status by improved operations. For\nvisual inspection, deteriorated class acquired from the surface of concrete and\nsteel components are occasionally imbalanced. From numerous related surveys, we\nsummarize that imbalanced data problems can be categorized into four types; 1)\nmissing range of target and label valuables, 2) majority-minority class\nimbalance, 3) foreground-background of spatial imbalance, 4) long-tailed class\nof pixel-wise imbalance. Since 2015, there has been many imbalanced studies\nusing deep learning approaches that includes regression, image classification,\nobject detection, semantic segmentation. However, anomaly detection for\nimbalanced data is not yet well known. 
In this study, we highlight a one-class\nanomaly detection application that decides whether a class is anomalous or not, and demonstrate\nclear examples on imbalanced vision datasets: wooden and concrete deterioration,\nand disaster damage. We provide key results on the advantage of damage vision mining,\nhypothesizing that the more effective the range of the positive ratio, the higher\nthe accuracy gain of the anomaly detection application. Finally, the applicability of\nthe damage learning methods, their limitations, and future work are discussed.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v1.pdf","comment":"12 pages, 14 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.12674v1","updated":"2023-07-24T10:24:13Z","published":"2023-07-24T10:24:13Z","title":"Industrial Segment Anything -- a Case Study in Aircraft Manufacturing,\n Intralogistics, Maintenance, Repair, and Overhaul","summary":" Deploying deep learning-based applications in specialized domains like the\naircraft production industry typically suffers from the training data\navailability problem. Only a few datasets represent non-everyday objects,\nsituations, and tasks. Recent advances in research around Vision Foundation\nModels (VFM) have opened a new area of tasks and models with high generalization\ncapabilities in non-semantic and semantic predictions. As recently demonstrated\nby the Segment Anything Project, exploiting VFM's zero-shot capabilities is a\npromising direction in tackling the boundaries spanned by data, context, and\nsensor variety. However, investigating its application within specific domains\nremains subject to ongoing research. This paper contributes here by surveying\napplications of the SAM in aircraft production-specific use cases. We include\nmanufacturing, intralogistics, as well as maintenance, repair, and overhaul\nprocesses, also representing a variety of other neighboring industrial domains.\nBesides presenting the various use cases, we further discuss the injection of\ndomain knowledge.\n","authors":["Keno Moenck","Arne Wendt","Philipp Prünte","Julian Koch","Arne Sahrhage","Johann Gierecker","Ole Schmedemann","Falko Kähler","Dirk Holst","Martin Gomse","Thorsten Schüppstuhl","Daniel Schoepflin"],"pdf_url":"https://arxiv.org/pdf/2307.12674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12672v1","updated":"2023-07-24T10:20:14Z","published":"2023-07-24T10:20:14Z","title":"Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked\n Image Modeling","summary":" In dynamic Magnetic Resonance Imaging (MRI), k-space is typically\nundersampled due to limited scan time, resulting in aliasing artifacts in the\nimage domain. Hence, dynamic MR reconstruction requires not only modeling\nspatial frequency components in the x and y directions of k-space but also\nconsidering temporal redundancy. Most previous works rely on image-domain\nregularizers (priors) to conduct MR reconstruction. In contrast, we focus on\ninterpolating the undersampled k-space before obtaining images with Fourier\ntransform. In this work, we connect masked image modeling with k-space\ninterpolation and propose a novel Transformer-based k-space Global\nInterpolation Network, termed k-GIN. Our k-GIN learns global dependencies among\nlow- and high-frequency components of 2D+t k-space and uses them to interpolate\nunsampled data. Further, we propose a novel k-space Iterative Refinement Module\n(k-IRM) to enhance the learning of high-frequency components. 
We evaluate our\napproach on 92 in-house 2D+t cardiac MR subjects and compare it to MR\nreconstruction methods with image-domain regularizers. Experiments show that\nour proposed k-space interpolation method quantitatively and qualitatively\noutperforms baseline methods. Importantly, the proposed approach achieves\nsubstantially higher robustness and generalizability in cases of\nhighly-undersampled MR data.\n","authors":["Jiazhen Pan","Suprosanna Shit","Özgün Turgut","Wenqi Huang","Hongwei Bran Li","Nil Stolt-Ansó","Thomas Küstner","Kerstin Hammernik","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.12672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07250v2","updated":"2023-07-24T10:10:25Z","published":"2023-04-14T16:58:23Z","title":"Fusing Structure from Motion and Simulation-Augmented Pose Regression\n from Optical Flow for Challenging Indoor Environments","summary":" The localization of objects is a crucial task in various applications such as\nrobotics, virtual and augmented reality, and the transportation of goods in\nwarehouses. Recent advances in deep learning have enabled the localization\nusing monocular visual cameras. While structure from motion (SfM) predicts the\nabsolute pose from a point cloud, absolute pose regression (APR) methods learn\na semantic understanding of the environment through neural networks. However,\nboth fields face challenges caused by the environment such as motion blur,\nlighting changes, repetitive patterns, and feature-less structures. This study\naims to address these challenges by incorporating additional information and\nregularizing the absolute pose using relative pose regression (RPR) methods.\nRPR methods suffer under different challenges, i.e., motion blur. The optical\nflow between consecutive images is computed using the Lucas-Kanade algorithm,\nand the relative pose is predicted using an auxiliary small recurrent\nconvolutional network. The fusion of absolute and relative poses is a complex\ntask due to the mismatch between the global and local coordinate systems.\nState-of-the-art methods fusing absolute and relative poses use pose graph\noptimization (PGO) to regularize the absolute pose predictions using relative\nposes. In this work, we propose recurrent fusion networks to optimally align\nabsolute and relative pose predictions to improve the absolute pose prediction.\nWe evaluate eight different recurrent units and construct a simulation\nenvironment to pre-train the APR and RPR networks for better generalized\ntraining. Additionally, we record a large database of different scenarios in a\nchallenging large-scale indoor environment that mimics a warehouse with\ntransportation robots. 
We conduct hyperparameter searches and experiments to\nshow the effectiveness of our recurrent fusion method compared to PGO.\n","authors":["Felix Ott","Lucas Heublein","David Rügamer","Bernd Bischl","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2304.07250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12656v1","updated":"2023-07-24T09:54:49Z","published":"2023-07-24T09:54:49Z","title":"A Theoretically Guaranteed Quaternion Weighted Schatten p-norm\n Minimization Method for Color Image Restoration","summary":" Inspired by the fact that the matrix formulated by nonlocal similar patches\nin a natural image is of low rank, the rank approximation issue has been\nextensively investigated over the past decades, among which weighted nuclear\nnorm minimization (WNNM) and weighted Schatten $p$-norm minimization (WSNM) are\ntwo prevailing methods that have shown great superiority in various image\nrestoration (IR) problems. Due to the physical characteristics of color images,\ncolor image restoration (CIR) is often a much more difficult task than its\ngrayscale image counterpart. However, when applied to CIR, the traditional\nWNNM/WSNM method only processes the three color channels individually and fails to\nconsider their cross-channel correlations. Very recently, a quaternion-based\nWNNM approach (QWNNM) has been developed to mitigate this issue, which is\ncapable of representing the color image as a whole in the quaternion domain and\npreserving the inherent correlation among the three color channels. Despite its\nempirical success, unfortunately, the convergence behavior of QWNNM has not\nbeen strictly studied yet. In this paper, on the one hand, we extend WSNM\ninto the quaternion domain and correspondingly propose a novel quaternion-based\nWSNM model (QWSNM) for tackling CIR problems. Extensive experiments on two\nrepresentative CIR tasks, including color image denoising and deblurring,\ndemonstrate that the proposed QWSNM method performs favorably against many\nstate-of-the-art alternatives, in both quantitative and qualitative\nevaluations. On the other hand, more importantly, we preliminarily provide a\ntheoretical convergence analysis; that is, by modifying the quaternion\nalternating direction method of multipliers (QADMM) through a simple\ncontinuation strategy, we theoretically prove that both the solution sequences\ngenerated by QWNNM and QWSNM have fixed-point convergence guarantees.\n","authors":["Qing-Hua Zhang","Liang-Tian He","Yi-Lun Wang","Liang-Jian Deng","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12656v1.pdf","comment":"46 pages, 10 figures; references added"},{"id":"http://arxiv.org/abs/2302.01162v5","updated":"2023-07-24T09:41:07Z","published":"2023-02-02T15:37:46Z","title":"Get3DHuman: Lifting StyleGAN-Human into a 3D Generative Model using\n Pixel-aligned Reconstruction Priors","summary":" Fast generation of high-quality 3D digital humans is important to a vast\nnumber of applications ranging from entertainment to professional concerns.\nRecent advances in differentiable rendering have enabled the training of 3D\ngenerative models without requiring 3D ground truths. However, the quality of\nthe generated 3D humans still has much room to improve in terms of both\nfidelity and diversity. In this paper, we present Get3DHuman, a novel 3D human\nframework that can significantly boost the realism and diversity of the\ngenerated outcomes by only using a limited budget of 3D ground-truth data. 
Our\nkey observation is that the 3D generator can profit from human-related priors\nlearned through 2D human generators and 3D reconstructors. Specifically, we\nbridge the latent space of Get3DHuman with that of StyleGAN-Human via a\nspecially-designed prior network, where the input latent code is mapped to the\nshape and texture feature volumes spanned by the pixel-aligned 3D\nreconstructor. The outcomes of the prior network are then leveraged as the\nsupervisory signals for the main generator network. To ensure effective\ntraining, we further propose three tailored losses applied to the generated\nfeature volumes and the intermediate feature maps. Extensive experiments\ndemonstrate that Get3DHuman greatly outperforms the other state-of-the-art\napproaches and can support a wide range of applications including shape\ninterpolation, shape re-texturing, and single-view reconstruction through\nlatent inversion.\n","authors":["Zhangyang Xiong","Di Kang","Derong Jin","Weikai Chen","Linchao Bao","Shuguang Cui","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2302.01162v5.pdf","comment":"ICCV 2023, project page:\n https://x-zhangyang.github.io/2023_Get3DHuman/"},{"id":"http://arxiv.org/abs/2307.12644v1","updated":"2023-07-24T09:35:47Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" Remote Photoplethysmography (rPPG) is a technology that utilizes the light\nabsorption properties of hemoglobin, captured via camera, to analyze and\nmeasure blood volume pulse (BVP). By analyzing the measured BVP, various\nphysiological signals such as heart rate, stress levels, and blood pressure can\nbe derived, enabling applications such as the early prediction of\ncardiovascular diseases. rPPG is a rapidly evolving field as it allows the\nmeasurement of vital signals using camera-equipped devices without the need for\nadditional devices such as blood pressure monitors or pulse oximeters, and\nwithout the assistance of medical experts. Despite extensive efforts and\nadvances in this field, serious challenges remain, including issues related to\nskin color, camera characteristics, ambient lighting, and other sources of\nnoise, which degrade performance accuracy. We argue that fair and evaluable\nbenchmarking is urgently required to overcome these challenges and make any\nmeaningful progress from both academic and commercial perspectives. In most\nexisting work, models are trained, tested, and validated only on limited\ndatasets. Worse still, some studies lack available code or reproducibility,\nmaking it difficult to fairly evaluate and compare performance. Therefore, the\npurpose of this study is to provide a benchmarking framework to evaluate\nvarious rPPG techniques across a wide range of datasets for fair evaluation and\ncomparison, including both conventional non-deep neural network (non-DNN) and\ndeep neural network (DNN) methods. 
GitHub URL:\nhttps://github.com/remotebiosensing/rppg.\n","authors":["Dae Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.03981v2","updated":"2023-07-24T09:24:04Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12637v1","updated":"2023-07-24T09:22:09Z","published":"2023-07-24T09:22:09Z","title":"PG-RCNN: Semantic Surface Point Generation for 3D Object Detection","summary":" One of the main challenges in LiDAR-based 3D object detection is that the\nsensors often fail to capture the complete spatial information about the\nobjects due to long distance and occlusion. Two-stage detectors with point\ncloud completion approaches tackle this problem by adding more points to the\nregions of interest (RoIs) with a pre-trained network. However, these methods\ngenerate dense point clouds of objects for all region proposals, assuming that\nobjects always exist in the RoIs. This leads to the indiscriminate point\ngeneration for incorrect proposals as well. Motivated by this, we propose Point\nGeneration R-CNN (PG-RCNN), a novel end-to-end detector that generates semantic\nsurface points of foreground objects for accurate detection. Our method uses a\njointly trained RoI point generation module to process the contextual\ninformation of RoIs and estimate the complete shape and displacement of\nforeground objects. For every generated point, PG-RCNN assigns a semantic\nfeature that indicates the estimated foreground probability. Extensive\nexperiments show that the point clouds generated by our method provide\ngeometrically and semantically rich information for refining false positive and\nmisaligned proposals. 
PG-RCNN achieves competitive performance on the KITTI\nbenchmark, with significantly fewer parameters than state-of-the-art models.\nThe code is available at https://github.com/quotation2520/PG-RCNN.\n","authors":["Inyong Koo","Inyoung Lee","Se-Ho Kim","Hee-Seon Kim","Woo-jin Jeon","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12637v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11643v2","updated":"2023-07-24T09:18:52Z","published":"2023-07-21T15:22:32Z","title":"Morphological Image Analysis and Feature Extraction for Reasoning with\n AI-based Defect Detection and Classification Models","summary":" As the use of artificial intelligent (AI) models becomes more prevalent in\nindustries such as engineering and manufacturing, it is essential that these\nmodels provide transparent reasoning behind their predictions. This paper\nproposes the AI-Reasoner, which extracts the morphological characteristics of\ndefects (DefChars) from images and utilises decision trees to reason with the\nDefChar values. Thereafter, the AI-Reasoner exports visualisations (i.e.\ncharts) and textual explanations to provide insights into outputs made by\nmasked-based defect detection and classification models. It also provides\neffective mitigation strategies to enhance data pre-processing and overall\nmodel performance. The AI-Reasoner was tested on explaining the outputs of an\nIE Mask R-CNN model using a set of 366 images containing defects. The results\ndemonstrated its effectiveness in explaining the IE Mask R-CNN model's\npredictions. Overall, the proposed AI-Reasoner provides a solution for\nimproving the performance of AI models in industrial applications that require\ndefect analysis.\n","authors":["Jiajun Zhang","Georgina Cosma","Sarah Bugby","Axel Finke","Jason Watkins"],"pdf_url":"https://arxiv.org/pdf/2307.11643v2.pdf","comment":"8 pages, 3 figures, 5 tables; submitted to 2023 IEEE symposium series\n on computational intelligence (SSCI)"},{"id":"http://arxiv.org/abs/2307.12634v1","updated":"2023-07-24T09:16:05Z","published":"2023-07-24T09:16:05Z","title":"Automatic lobe segmentation using attentive cross entropy and end-to-end\n fissure generation","summary":" The automatic lung lobe segmentation algorithm is of great significance for\nthe diagnosis and treatment of lung diseases, however, which has great\nchallenges due to the incompleteness of pulmonary fissures in lung CT images\nand the large variability of pathological features. Therefore, we propose a new\nautomatic lung lobe segmentation framework, in which we urge the model to pay\nattention to the area around the pulmonary fissure during the training process,\nwhich is realized by a task-specific loss function. In addition, we introduce\nan end-to-end pulmonary fissure generation method in the auxiliary pulmonary\nfissure segmentation task, without any additional network branch. Finally, we\npropose a registration-based loss function to alleviate the convergence\ndifficulty of the Dice loss supervised pulmonary fissure segmentation task. 
We\nachieve 97.83% and 94.75% dice scores on our private dataset STLB and public\nLUNA16 dataset respectively.\n","authors":["Qi Su","Na Wang","Jiawen Xie","Yinan Chen","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12634v1.pdf","comment":"5 pages, 3 figures, published to 'IEEE International Symposium on\n Biomedical Imaging (ISBI) 2023'"},{"id":"http://arxiv.org/abs/2307.12630v1","updated":"2023-07-24T09:08:30Z","published":"2023-07-24T09:08:30Z","title":"Semi-Supervised Medical Image Segmentation with Co-Distribution\n Alignment","summary":" Medical image segmentation has made significant progress when a large amount\nof labeled data are available. However, annotating medical image segmentation\ndatasets is expensive due to the requirement of professional skills.\nAdditionally, classes are often unevenly distributed in medical images, which\nseverely affects the classification performance on minority classes. To address\nthese problems, this paper proposes Co-Distribution Alignment (Co-DA) for\nsemi-supervised medical image segmentation. Specifically, Co-DA aligns marginal\npredictions on unlabeled data to marginal predictions on labeled data in a\nclass-wise manner with two differently initialized models before using the\npseudo-labels generated by one model to supervise the other. Besides, we design\nan over-expectation cross-entropy loss for filtering the unlabeled pixels to\nreduce noise in their pseudo-labels. Quantitative and qualitative experiments\non three public datasets demonstrate that the proposed approach outperforms\nexisting state-of-the-art semi-supervised medical image segmentation methods on\nboth the 2D CaDIS dataset and the 3D LGE-MRI and ACDC datasets, achieving an\nmIoU of 0.8515 with only 24% labeled data on CaDIS, and a Dice score of 0.8824\nand 0.8773 with only 20% data on LGE-MRI and ACDC, respectively.\n","authors":["Tao Wang","Zhongzheng Huang","Jiawei Wu","Yuanzheng Cai","Zuoyong Li"],"pdf_url":"https://arxiv.org/pdf/2307.12630v1.pdf","comment":"Paper appears in Bioengineering 2023, 10(7), 869"},{"id":"http://arxiv.org/abs/2307.12622v1","updated":"2023-07-24T08:51:49Z","published":"2023-07-24T08:51:49Z","title":"Phase Match for Out-of-Distribution Generalization","summary":" The Fourier transform, serving as an explicit decomposition method for visual\nsignals, has been employed to explain the out-of-distribution generalization\nbehaviors of Convolutional Neural Networks (CNNs). Previous research and\nempirical studies have indicated that the amplitude spectrum plays a decisive\nrole in CNN recognition, but it is susceptible to disturbance caused by\ndistribution shifts. On the other hand, the phase spectrum preserves\nhighly-structured spatial information, which is crucial for visual\nrepresentation learning. In this paper, we aim to clarify the relationships\nbetween Domain Generalization (DG) and the frequency components by introducing\na Fourier-based structural causal model. Specifically, we interpret the phase\nspectrum as semi-causal factors and the amplitude spectrum as non-causal\nfactors. Building upon these observations, we propose Phase Match (PhaMa) to\naddress DG problems. 
Our method introduces perturbations on the amplitude\nspectrum and establishes spatial relationships to match the phase components.\nThrough experiments on multiple benchmarks, we demonstrate that our proposed\nmethod achieves state-of-the-art performance in domain generalization and\nout-of-distribution robustness tasks.\n","authors":["Chengming Hu","Rui Wang","Hao Chen","Zhouwang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12619v1","updated":"2023-07-24T08:49:20Z","published":"2023-07-24T08:49:20Z","title":"Sparse annotation strategies for segmentation of short axis cardiac MRI","summary":" Short axis cardiac MRI segmentation is a well-researched topic, with\nexcellent results achieved by state-of-the-art models in a supervised setting.\nHowever, annotating MRI volumes is time-consuming and expensive. Many different\napproaches (e.g. transfer learning, data augmentation, few-shot learning, etc.)\nhave emerged in an effort to use fewer annotated data and still achieve similar\nperformance as a fully supervised model. Nevertheless, to the best of our\nknowledge, none of these works focus on which slices of MRI volumes are most\nimportant to annotate for yielding the best segmentation results. In this\npaper, we investigate the effects of training with sparse volumes, i.e.\nreducing the number of cases annotated, and sparse annotations, i.e. reducing\nthe number of slices annotated per case. We evaluate the segmentation\nperformance using the state-of-the-art nnU-Net model on two public datasets to\nidentify which slices are the most important to annotate. We have shown that\ntraining on a significantly reduced dataset (48 annotated volumes) can give a\nDice score greater than 0.85 and results comparable to using the full dataset\n(160 and 240 volumes for each dataset respectively). In general, training on\nmore slice annotations provides more valuable information compared to training\non more volumes. Further, annotating slices from the middle of volumes yields\nthe most beneficial results in terms of segmentation performance, and the\napical region the worst. When evaluating the trade-off between annotating\nvolumes against slices, annotating as many slices as possible instead of\nannotating more volumes is a better strategy.\n","authors":["Josh Stein","Maxime Di Folco","Julia Schnabel"],"pdf_url":"https://arxiv.org/pdf/2307.12619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12618v1","updated":"2023-07-24T08:47:45Z","published":"2023-07-24T08:47:45Z","title":"Attribute Regularized Soft Introspective VAE: Towards Cardiac Attribute\n Regularization Through MRI Domains","summary":" Deep generative models have emerged as influential instruments for data\ngeneration and manipulation. Enhancing the controllability of these models by\nselectively modifying data attributes has been a recent focus. Variational\nAutoencoders (VAEs) have shown promise in capturing hidden attributes but often\nproduce blurry reconstructions. Controlling these attributes through different\nimaging domains is difficult in medical imaging. Recently, Soft Introspective\nVAE leverage the benefits of both VAEs and Generative Adversarial Networks\n(GANs), which have demonstrated impressive image synthesis capabilities, by\nincorporating an adversarial loss into VAE training. In this work, we propose\nthe Attributed Soft Introspective VAE (Attri-SIVAE) by incorporating an\nattribute regularized loss, into the Soft-Intro VAE framework. 
We evaluate\nexperimentally the proposed method on cardiac MRI data from different domains,\nsuch as various scanner vendors and acquisition centers. The proposed method\nachieves similar performance in terms of reconstruction and regularization\ncompared to the state-of-the-art Attributed regularized VAE but additionally\nalso succeeds in keeping the same regularization level when tested on a\ndifferent dataset, unlike the compared method.\n","authors":["Maxime Di Folco","Cosmin Bercea","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2307.12618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12616v1","updated":"2023-07-24T08:44:25Z","published":"2023-07-24T08:44:25Z","title":"CTVIS: Consistent Training for Online Video Instance Segmentation","summary":" The discrimination of instance embeddings plays a vital role in associating\ninstances across time for online video instance segmentation (VIS). Instance\nembedding learning is directly supervised by the contrastive loss computed upon\nthe contrastive items (CIs), which are sets of anchor/positive/negative\nembeddings. Recent online VIS methods leverage CIs sourced from one reference\nframe only, which we argue is insufficient for learning highly discriminative\nembeddings. Intuitively, a possible strategy to enhance CIs is replicating the\ninference phase during training. To this end, we propose a simple yet effective\ntraining strategy, called Consistent Training for Online VIS (CTVIS), which\ndevotes to aligning the training and inference pipelines in terms of building\nCIs. Specifically, CTVIS constructs CIs by referring inference the\nmomentum-averaged embedding and the memory bank storage mechanisms, and adding\nnoise to the relevant embeddings. Such an extension allows a reliable\ncomparison between embeddings of current instances and the stable\nrepresentations of historical instances, thereby conferring an advantage in\nmodeling VIS challenges such as occlusion, re-identification, and deformation.\nEmpirically, CTVIS outstrips the SOTA VIS models by up to +5.0 points on three\nVIS benchmarks, including YTVIS19 (55.1% AP), YTVIS21 (50.1% AP) and OVIS\n(35.5% AP). Furthermore, we find that pseudo-videos transformed from images can\ntrain robust models surpassing fully-supervised ones.\n","authors":["Kaining Ying","Qing Zhong","Weian Mao","Zhenhua Wang","Hao Chen","Lin Yuanbo Wu","Yifan Liu","Chengxiang Fan","Yunzhi Zhuge","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2307.12616v1.pdf","comment":"Accepted by ICCV 2023. The code is available at\n https://github.com/KainingYing/CTVIS"},{"id":"http://arxiv.org/abs/2307.12612v1","updated":"2023-07-24T08:39:11Z","published":"2023-07-24T08:39:11Z","title":"Less is More: Focus Attention for Efficient DETR","summary":" DETR-like models have significantly boosted the performance of detectors and\neven outperformed classical convolutional models. However, all tokens are\ntreated equally without discrimination brings a redundant computational burden\nin the traditional encoder structure. The recent sparsification strategies\nexploit a subset of informative tokens to reduce attention complexity\nmaintaining performance through the sparse encoder. But these methods tend to\nrely on unreliable model statistics. Moreover, simply reducing the token\npopulation hinders the detection performance to a large extent, limiting the\napplication of these sparse models. 
We propose Focus-DETR, which focuses\nattention on more informative tokens for a better trade-off between computation\nefficiency and model accuracy. Specifically, we reconstruct the encoder with\ndual attention, which includes a token scoring mechanism that considers both\nlocalization and category semantic information of the objects from multi-scale\nfeature maps. We efficiently abandon the background queries and enhance the\nsemantic interaction of the fine-grained object queries based on the scores.\nCompared with the state-of-the-art sparse DETR-like detectors under the same\nsetting, our Focus-DETR gets comparable complexity while achieving 50.4AP\n(+2.2) on COCO. The code is available at\nhttps://github.com/huawei-noah/noah-research/tree/master/Focus-DETR and\nhttps://gitee.com/mindspore/models/tree/master/research/cv/Focus-DETR.\n","authors":["Dehua Zheng","Wenhui Dong","Hailin Hu","Xinghao Chen","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12612v1.pdf","comment":"8 pages, 6 figures, accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.12607v1","updated":"2023-07-24T08:32:27Z","published":"2023-07-24T08:32:27Z","title":"ExWarp: Extrapolation and Warping-based Temporal Supersampling for\n High-frequency Displays","summary":" High-frequency displays are gaining immense popularity because of their\nincreasing use in video games and virtual reality applications. However, the\nissue is that the underlying GPUs cannot continuously generate frames at this\nhigh rate -- this results in a less smooth and responsive experience.\nFurthermore, if the frame rate is not synchronized with the refresh rate, the\nuser may experience screen tearing and stuttering. Previous works propose\nincreasing the frame rate to provide a smooth experience on modern displays by\npredicting new frames based on past or future frames. Interpolation and\nextrapolation are two widely used algorithms that predict new frames.\nInterpolation requires waiting for the future frame to make a prediction, which\nadds additional latency. On the other hand, extrapolation provides a better\nquality of experience because it relies solely on past frames -- it does not\nincur any additional latency. The simplest method to extrapolate a frame is to\nwarp the previous frame using motion vectors; however, the warped frame may\ncontain improperly rendered visual artifacts due to dynamic objects -- this\nmakes it very challenging to design such a scheme. Past work has used DNNs to\nget good accuracy, however, these approaches are slow. This paper proposes\nExwarp -- an approach based on reinforcement learning (RL) to intelligently\nchoose between the slower DNN-based extrapolation and faster warping-based\nmethods to increase the frame rate by 4x with an almost negligible reduction in\nthe perceived image quality.\n","authors":["Akanksha Dixit","Yashashwee Chakrabarty","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2307.12607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07515v2","updated":"2023-07-24T08:10:52Z","published":"2023-04-15T09:39:52Z","title":"S3M: Scalable Statistical Shape Modeling through Unsupervised\n Correspondences","summary":" Statistical shape models (SSMs) are an established way to represent the\nanatomy of a population with various clinically relevant applications. However,\nthey typically require domain expertise, and labor-intensive landmark\nannotations to construct. 
We address these shortcomings by proposing an\nunsupervised method that leverages deep geometric features and functional\ncorrespondences to simultaneously learn local and global shape structures\nacross population anatomies. Our pipeline significantly improves unsupervised\ncorrespondence estimation for SSMs compared to baseline methods, even on highly\nirregular surface topologies. We demonstrate this for two different anatomical\nstructures: the thyroid and a multi-chamber heart dataset. Furthermore, our\nmethod is robust enough to learn from noisy neural network predictions,\npotentially enabling scaling SSMs to larger patient populations without manual\nsegmentation annotation.\n","authors":["Lennart Bastian","Alexander Baumann","Emily Hoppe","Vincent Bürgin","Ha Young Kim","Mahdi Saleh","Benjamin Busam","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2304.07515v2.pdf","comment":"Accepted at MICCAI 2023. 13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12591v1","updated":"2023-07-24T08:06:46Z","published":"2023-07-24T08:06:46Z","title":"SwinMM: Masked Multi-view with Swin Transformers for 3D Medical Image\n Segmentation","summary":" Recent advancements in large-scale Vision Transformers have made significant\nstrides in improving pre-trained models for medical image segmentation.\nHowever, these methods face a notable challenge in acquiring a substantial\namount of pre-training data, particularly within the medical field. To address\nthis limitation, we present Masked Multi-view with Swin Transformers (SwinMM),\na novel multi-view pipeline for enabling accurate and data-efficient\nself-supervised medical image analysis. Our strategy harnesses the potential of\nmulti-view information by incorporating two principal components. In the\npre-training phase, we deploy a masked multi-view encoder devised to\nconcurrently train masked multi-view observations through a range of diverse\nproxy tasks. These tasks span image reconstruction, rotation, contrastive\nlearning, and a novel task that employs a mutual learning paradigm. This new\ntask capitalizes on the consistency between predictions from various\nperspectives, enabling the extraction of hidden multi-view information from 3D\nmedical data. In the fine-tuning stage, a cross-view decoder is developed to\naggregate the multi-view information through a cross-attention block. Compared\nwith the previous state-of-the-art self-supervised learning method Swin UNETR,\nSwinMM demonstrates a notable advantage on several medical image segmentation\ntasks. It allows for a smooth integration of multi-view information,\nsignificantly boosting both the accuracy and data-efficiency of the model. Code\nand models are available at https://github.com/UCSC-VLAA/SwinMM/.\n","authors":["Yiqing Wang","Zihan Li","Jieru Mei","Zihao Wei","Li Liu","Chen Wang","Shengtian Sang","Alan Yuille","Cihang Xie","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12591v1.pdf","comment":"MICCAI 2023; project page: https://github.com/UCSC-VLAA/SwinMM/"},{"id":"http://arxiv.org/abs/2307.12580v1","updated":"2023-07-24T07:51:40Z","published":"2023-07-24T07:51:40Z","title":"SL: Stable Learning in Source-Free Domain Adaption for Medical Image\n Segmentation","summary":" Deep learning techniques for medical image analysis usually suffer from the\ndomain shift between source and target data. Most existing works focus on\nunsupervised domain adaptation (UDA). However, in practical applications,\nprivacy issues are much more severe. 
For example, the data of different\nhospitals have domain shifts due to equipment problems, and data of the two\ndomains cannot be available simultaneously because of privacy. In this\nchallenge defined as Source-Free UDA, the previous UDA medical methods are\nlimited. Although a variety of medical source-free unsupervised domain adaption\n(MSFUDA) methods have been proposed, we found they fall into an over-fitting\ndilemma called \"longer training, worse performance.\" Therefore, we propose the\nStable Learning (SL) strategy to address the dilemma. SL is a scalable method\nand can be integrated with other research, which consists of Weight\nConsolidation and Entropy Increase. First, we apply Weight Consolidation to\nretain domain-invariant knowledge and then we design Entropy Increase to avoid\nover-learning. Comparative experiments prove the effectiveness of SL. We also\nhave done extensive ablation experiments. Besides, We will release codes\nincluding a variety of MSFUDA methods.\n","authors":["Yixin Chen","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12577v1","updated":"2023-07-24T07:49:01Z","published":"2023-07-24T07:49:01Z","title":"PRIOR: Prototype Representation Joint Learning from Medical Images and\n Reports","summary":" Contrastive learning based vision-language joint pre-training has emerged as\na successful representation learning strategy. In this paper, we present a\nprototype representation learning framework incorporating both global and local\nalignment between medical images and reports. In contrast to standard global\nmulti-modality alignment methods, we employ a local alignment module for\nfine-grained representation. Furthermore, a cross-modality conditional\nreconstruction module is designed to interchange information across modalities\nin the training phase by reconstructing masked images and reports. For\nreconstructing long reports, a sentence-wise prototype memory bank is\nconstructed, enabling the network to focus on low-level localized visual and\nhigh-level clinical linguistic features. Additionally, a non-auto-regressive\ngeneration paradigm is proposed for reconstructing non-sequential reports.\nExperimental results on five downstream tasks, including supervised\nclassification, zero-shot classification, image-to-text retrieval, semantic\nsegmentation, and object detection, show the proposed method outperforms other\nstate-of-the-art methods across multiple datasets and under different dataset\nsize settings. 
The code is available at https://github.com/QtacierP/PRIOR.\n","authors":["Pujin Cheng","Li Lin","Junyan Lyu","Yijin Huang","Wenhan Luo","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2307.12577v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12574v1","updated":"2023-07-24T07:46:06Z","published":"2023-07-24T07:46:06Z","title":"A Good Student is Cooperative and Reliable: CNN-Transformer\n Collaborative Learning for Semantic Segmentation","summary":" In this paper, we strive to answer the question \"how to collaboratively learn\nconvolutional neural network (CNN)-based and vision transformer (ViT)-based\nmodels by selecting and exchanging the reliable knowledge between them for\nsemantic segmentation?\" Accordingly, we propose an online knowledge\ndistillation (KD) framework that can simultaneously learn compact yet effective\nCNN-based and ViT-based models with two key technical breakthroughs to take\nfull advantage of CNNs and ViT while compensating their limitations. Firstly,\nwe propose heterogeneous feature distillation (HFD) to improve students'\nconsistency in low-layer feature space by mimicking heterogeneous features\nbetween CNNs and ViT. Secondly, to facilitate the two students to learn\nreliable knowledge from each other, we propose bidirectional selective\ndistillation (BSD) that can dynamically transfer selective knowledge. This is\nachieved by 1) region-wise BSD determining the directions of knowledge\ntransferred between the corresponding regions in the feature space and 2)\npixel-wise BSD discerning which of the prediction knowledge to be transferred\nin the logit space. Extensive experiments on three benchmark datasets\ndemonstrate that our proposed framework outperforms the state-of-the-art online\ndistillation methods by a large margin, and shows its efficacy in learning\ncollaboratively between ViT-based and CNN-based models.\n","authors":["Jinjing Zhu","Yunhao Luo","Xu Zheng","Hao Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12574v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2210.10495v3","updated":"2023-07-24T07:43:31Z","published":"2022-10-19T12:04:47Z","title":"ADPS: Asymmetric Distillation Post-Segmentation for Image Anomaly\n Detection","summary":" Knowledge Distillation-based Anomaly Detection (KDAD) methods rely on the\nteacher-student paradigm to detect and segment anomalous regions by contrasting\nthe unique features extracted by both networks. However, existing KDAD methods\nsuffer from two main limitations: 1) the student network can effortlessly\nreplicate the teacher network's representations, and 2) the features of the\nteacher network serve solely as a ``reference standard\" and are not fully\nleveraged. Toward this end, we depart from the established paradigm and instead\npropose an innovative approach called Asymmetric Distillation Post-Segmentation\n(ADPS). Our ADPS employs an asymmetric distillation paradigm that takes\ndistinct forms of the same image as the input of the teacher-student networks,\ndriving the student network to learn discriminating representations for\nanomalous regions.\n Meanwhile, a customized Weight Mask Block (WMB) is proposed to generate a\ncoarse anomaly localization mask that transfers the distilled knowledge\nacquired from the asymmetric paradigm to the teacher network. 
Equipped with\nWMB, the proposed Post-Segmentation Module (PSM) is able to effectively detect\nand segment abnormal regions with fine structures and clear boundaries.\nExperimental results demonstrate that the proposed ADPS outperforms the\nstate-of-the-art methods in detecting and segmenting anomalies. Surprisingly,\nADPS significantly improves Average Precision (AP) metric by 9% and 20% on the\nMVTec AD and KolektorSDD2 datasets, respectively.\n","authors":["Peng Xing","Hao Tang","Jinhui Tang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2210.10495v3.pdf","comment":"11pages,9 figures"},{"id":"http://arxiv.org/abs/2307.12571v1","updated":"2023-07-24T07:39:22Z","published":"2023-07-24T07:39:22Z","title":"MataDoc: Margin and Text Aware Document Dewarping for Arbitrary Boundary","summary":" Document dewarping from a distorted camera-captured image is of great value\nfor OCR and document understanding. The document boundary plays an important\nrole which is more evident than the inner region in document dewarping. Current\nlearning-based methods mainly focus on complete boundary cases, leading to poor\ndocument correction performance of documents with incomplete boundaries. In\ncontrast to these methods, this paper proposes MataDoc, the first method\nfocusing on arbitrary boundary document dewarping with margin and text aware\nregularizations. Specifically, we design the margin regularization by\nexplicitly considering background consistency to enhance boundary perception.\nMoreover, we introduce word position consistency to keep text lines straight in\nrectified document images. To produce a comprehensive evaluation of MataDoc, we\npropose a novel benchmark ArbDoc, mainly consisting of document images with\narbitrary boundaries in four typical scenarios. Extensive experiments confirm\nthe superiority of MataDoc with consideration for the incomplete boundary on\nArbDoc and also demonstrate the effectiveness of the proposed method on\nDocUNet, DIR300, and WarpDoc datasets.\n","authors":["Beiya Dai","Xing li","Qunyi Xie","Yulin Li","Xiameng Qin","Chengquan Zhang","Kun Yao","Junyu Han"],"pdf_url":"https://arxiv.org/pdf/2307.12571v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2307.12560v1","updated":"2023-07-24T07:03:22Z","published":"2023-07-24T07:03:22Z","title":"Interpolating between Images with Diffusion Models","summary":" One little-explored frontier of image generation and editing is the task of\ninterpolating between two input images, a feature missing from all currently\ndeployed image generation pipelines. We argue that such a feature can expand\nthe creative applications of such models, and propose a method for zero-shot\ninterpolation using latent diffusion models. We apply interpolation in the\nlatent space at a sequence of decreasing noise levels, then perform denoising\nconditioned on interpolated text embeddings derived from textual inversion and\n(optionally) subject poses. For greater consistency, or to specify additional\ncriteria, we can generate several candidates and use CLIP to select the highest\nquality image. We obtain convincing interpolations across diverse subject\nposes, image styles, and image content, and show that standard quantitative\nmetrics such as FID are insufficient to measure the quality of an\ninterpolation. Code and data are available at\nhttps://clintonjwang.github.io/interpolation.\n","authors":["Clinton J. 
Wang","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2307.12560v1.pdf","comment":"Presented at ICML 2023 Workshop on Challenges of Deploying Generative\n AI"},{"id":"http://arxiv.org/abs/2203.01923v4","updated":"2023-07-24T06:59:56Z","published":"2022-03-03T18:56:08Z","title":"Recovering 3D Human Mesh from Monocular Images: A Survey","summary":" Estimating human pose and shape from monocular images is a long-standing\nproblem in computer vision. Since the release of statistical body models, 3D\nhuman mesh recovery has been drawing broader attention. With the same goal of\nobtaining well-aligned and physically plausible mesh results, two paradigms\nhave been developed to overcome challenges in the 2D-to-3D lifting process: i)\nan optimization-based paradigm, where different data terms and regularization\nterms are exploited as optimization objectives; and ii) a regression-based\nparadigm, where deep learning techniques are embraced to solve the problem in\nan end-to-end fashion. Meanwhile, continuous efforts are devoted to improving\nthe quality of 3D mesh labels for a wide range of datasets. Though remarkable\nprogress has been achieved in the past decade, the task is still challenging\ndue to flexible body motions, diverse appearances, complex environments, and\ninsufficient in-the-wild annotations. To the best of our knowledge, this is the\nfirst survey that focuses on the task of monocular 3D human mesh recovery. We\nstart with the introduction of body models and then elaborate recovery\nframeworks and training objectives by providing in-depth analyses of their\nstrengths and weaknesses. We also summarize datasets, evaluation metrics, and\nbenchmark results. Open issues and future directions are discussed in the end,\nhoping to motivate researchers and facilitate their research in this area. A\nregularly updated project page can be found at\nhttps://github.com/tinatiansjz/hmr-survey.\n","authors":["Yating Tian","Hongwen Zhang","Yebin Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2203.01923v4.pdf","comment":"Accepted to IEEE TPAMI, Survey on monocular 3D human mesh recovery,\n Project page: https://github.com/tinatiansjz/hmr-survey"},{"id":"http://arxiv.org/abs/2307.12558v1","updated":"2023-07-24T06:51:07Z","published":"2023-07-24T06:51:07Z","title":"Revisiting Event-based Video Frame Interpolation","summary":" Dynamic vision sensors or event cameras provide rich complementary\ninformation for video frame interpolation. Existing state-of-the-art methods\nfollow the paradigm of combining both synthesis-based and warping networks.\nHowever, few of those methods fully respect the intrinsic characteristics of\nevents streams. Given that event cameras only encode intensity changes and\npolarity rather than color intensities, estimating optical flow from events is\narguably more difficult than from RGB information. We therefore propose to\nincorporate RGB information in an event-guided optical flow refinement\nstrategy. Moreover, in light of the quasi-continuous nature of the time signals\nprovided by event cameras, we propose a divide-and-conquer strategy in which\nevent-based intermediate frame synthesis happens incrementally in multiple\nsimplified stages rather than in a single, long stage. Extensive experiments on\nboth synthetic and real-world datasets show that these modifications lead to\nmore reliable and realistic intermediate frame results than previous video\nframe interpolation methods. 
Our findings underline that a careful\nconsideration of event characteristics such as high temporal density and\nelevated noise benefits interpolation accuracy.\n","authors":["Jiaben Chen","Yichen Zhu","Dongze Lian","Jiaqi Yang","Yifu Wang","Renrui Zhang","Xinhang Liu","Shenhan Qian","Laurent Kneip","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2307.12558v1.pdf","comment":"Accepted by IROS2023 Project Site:\n https://jiabenchen.github.io/revisit_event"},{"id":"http://arxiv.org/abs/2307.12548v1","updated":"2023-07-24T06:33:52Z","published":"2023-07-24T06:33:52Z","title":"MFMAN-YOLO: A Method for Detecting Pole-like Obstacles in Complex\n Environment","summary":" In real-world traffic, there are various uncertainties and complexities in\nroad and weather conditions. To solve the problem that the feature information\nof pole-like obstacles in complex environments is easily lost, resulting in low\ndetection accuracy and low real-time performance, a multi-scale hybrid\nattention mechanism detection algorithm is proposed in this paper. First, the\noptimal transport function Monge-Kantorovich (MK) is incorporated not only to\nsolve the problem of overlapping multiple prediction frames with optimal\nmatching but also the MK function can be regularized to prevent model\nover-fitting; then, the features at different scales are up-sampled separately\naccording to the optimized efficient multi-scale feature pyramid. Finally, the\nextraction of multi-scale feature space channel information is enhanced in\ncomplex environments based on the hybrid attention mechanism, which suppresses\nthe irrelevant complex environment background information and focuses the\nfeature information of pole-like obstacles. Meanwhile, this paper conducts real\nroad test experiments in a variety of complex environments. The experimental\nresults show that the detection precision, recall, and average precision of the\nmethod are 94.7%, 93.1%, and 97.4%, respectively, and the detection frame rate\nis 400 f/s. This research method can detect pole-like obstacles in a complex\nroad environment in real time and accurately, which further promotes innovation\nand progress in the field of automatic driving.\n","authors":["Lei Cai","Hao Wang","Congling Zhou","Yongqiang Wang","Boyu Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12548v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2301.01482v5","updated":"2023-07-24T06:31:58Z","published":"2023-01-04T08:22:34Z","title":"Underwater Object Tracker: UOSTrack for Marine Organism Grasping of\n Underwater Vehicles","summary":" A visual single-object tracker is an indispensable component of underwater\nvehicles (UVs) in marine organism grasping tasks. Its accuracy and stability\nare imperative to guide the UVs to perform grasping behavior. Although\nsingle-object trackers show competitive performance in the challenge of\nunderwater image degradation, there are still issues with sample imbalance and\nexclusion of similar objects that need to be addressed for application in\nmarine organism grasping. This paper proposes Underwater OSTrack (UOSTrack),\nwhich consists of underwater image and open-air sequence hybrid training\n(UOHT), and motion-based post-processing (MBPP). The UOHT training paradigm is\ndesigned to train the sample-imbalanced underwater tracker so that the tracker\nis exposed to a great number of underwater domain training samples and learns\nthe feature expressions. The MBPP paradigm is proposed to exclude similar\nobjects. 
It uses the estimation box predicted with a Kalman filter and the\ncandidate boxes in the response map to relocate the lost tracked object in the\ncandidate area. UOSTrack achieves an average performance improvement of 4.41%\nand a maximum of 7.98% compared to state-of-the-art methods on various benchmarks.\nField experiments have verified the accuracy and stability of our\nproposed UOSTrack for UVs in marine organism grasping tasks. More details can\nbe found at https://github.com/LiYunfengLYF/UOSTrack.\n","authors":["Yunfeng Li","Bo Wang","Ye Li","Zhuoyan Liu","Wei Huo","Yueming Li","Jian Cao"],"pdf_url":"https://arxiv.org/pdf/2301.01482v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12545v1","updated":"2023-07-24T06:22:37Z","published":"2023-07-24T06:22:37Z","title":"Towards Video Anomaly Retrieval from Video Anomaly Detection: New\n Benchmarks and Model","summary":" Video anomaly detection (VAD) has received increasing attention due to its\npotential applications; its current dominant tasks focus on detecting\nanomalies online at the frame level, which can be roughly interpreted as binary\nor multi-class event classification. However, such a setup that builds\nrelationships between complicated anomalous events and single labels, e.g.,\n``vandalism'', is superficial, since single labels are insufficient to\ncharacterize anomalous events. In reality, users tend to search for a specific\nvideo rather than a series of approximate videos. Therefore, retrieving\nanomalous events using detailed descriptions is practical and valuable, but little\nresearch has focused on this. In this context, we propose a novel task called Video\nAnomaly Retrieval (VAR), which aims to pragmatically retrieve relevant\nanomalous videos by cross-modalities, e.g., language descriptions and\nsynchronous audio. Unlike current video retrieval, where videos are assumed\nto be temporally well-trimmed and of short duration, VAR is devised to retrieve\nlong untrimmed videos which may be partially relevant to the given query. To\nachieve this, we present two large-scale VAR benchmarks, UCFCrime-AR and\nXDViolence-AR, constructed on top of prevalent anomaly datasets. Meanwhile, we\ndesign a model called Anomaly-Led Alignment Network (ALAN) for VAR. In ALAN, we\npropose an anomaly-led sampling to focus on key segments in long untrimmed\nvideos. Then, we introduce an efficient pretext task to enhance semantic\nassociations between video-text fine-grained representations. Besides, we\nleverage two complementary alignments to further match cross-modal contents.\nExperimental results on two benchmarks reveal the challenges of the VAR task and\nalso demonstrate the advantages of our tailored method.\n","authors":["Peng Wu","Jing Liu","Xiangteng He","Yuxin Peng","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12545v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2307.12542v1","updated":"2023-07-24T06:12:37Z","published":"2023-07-24T06:12:37Z","title":"Client-Level Differential Privacy via Adaptive Intermediary in Federated\n Medical Imaging","summary":" Despite recent progress in enhancing the privacy of federated learning (FL)\nvia differential privacy (DP), the trade-off of DP between privacy protection\nand performance is still underexplored for real-world medical scenarios. 
In this\npaper, we propose to optimize the trade-off under the context of client-level\nDP, which focuses on privacy during communications. However, FL for medical\nimaging involves typically much fewer participants (hospitals) than other\ndomains (e.g., mobile devices), thus ensuring clients be differentially private\nis much more challenging. To tackle this problem, we propose an adaptive\nintermediary strategy to improve performance without harming privacy.\nSpecifically, we theoretically find splitting clients into sub-clients, which\nserve as intermediaries between hospitals and the server, can mitigate the\nnoises introduced by DP without harming privacy. Our proposed approach is\nempirically evaluated on both classification and segmentation tasks using two\npublic datasets, and its effectiveness is demonstrated with significant\nperformance improvements and comprehensive analytical studies. Code is\navailable at: https://github.com/med-air/Client-DP-FL.\n","authors":["Meirui Jiang","Yuan Zhong","Anjie Le","Xiaoxiao Li","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2307.12542v1.pdf","comment":"Accepted by 26th International Conference on Medical Image Computing\n and Computer Assisted Intervention (MICCAI'23)"},{"id":"http://arxiv.org/abs/2303.05021v3","updated":"2023-07-24T06:06:27Z","published":"2023-03-09T03:48:24Z","title":"DiffusionDepth: Diffusion Denoising Approach for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a challenging task that predicts the pixel-wise\ndepth from a single 2D image. Current methods typically model this problem as a\nregression or classification task. We propose DiffusionDepth, a new approach\nthat reformulates monocular depth estimation as a denoising diffusion process.\nIt learns an iterative denoising process to `denoise' random depth distribution\ninto a depth map with the guidance of monocular visual conditions. The process\nis performed in the latent space encoded by a dedicated depth encoder and\ndecoder. Instead of diffusing ground truth (GT) depth, the model learns to\nreverse the process of diffusing the refined depth of itself into random depth\ndistribution. This self-diffusion formulation overcomes the difficulty of\napplying generative models to sparse GT depth scenarios. The proposed approach\nbenefits this task by refining depth estimation step by step, which is superior\nfor generating accurate and highly detailed depth maps. Experimental results on\nKITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion\napproach could reach state-of-the-art performance in both indoor and outdoor\nscenarios with acceptable inference time.\n","authors":["Yiqun Duan","Xianda Guo","Zheng Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.05021v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12540v1","updated":"2023-07-24T06:04:12Z","published":"2023-07-24T06:04:12Z","title":"SelFormaly: Towards Task-Agnostic Unified Anomaly Detection","summary":" The core idea of visual anomaly detection is to learn the normality from\nnormal images, but previous works have been developed specifically for certain\ntasks, leading to fragmentation among various tasks: defect detection, semantic\nanomaly detection, multi-class anomaly detection, and anomaly clustering. This\none-task-one-model approach is resource-intensive and incurs high maintenance\ncosts as the number of tasks increases. This paper presents SelFormaly, a\nuniversal and powerful anomaly detection framework. 
We emphasize the necessity\nof our off-the-shelf approach by pointing out a suboptimal issue with\nfluctuating performance in previous online encoder-based methods. In addition,\nwe question the effectiveness of using ConvNets as previously employed in the\nliterature and confirm that self-supervised ViTs are suitable for unified\nanomaly detection. We introduce back-patch masking and discover the new role of\ntop k-ratio feature matching to achieve unified and powerful anomaly detection.\nBack-patch masking eliminates irrelevant regions that possibly hinder\ntarget-centric detection with representations of the scene layout. The top\nk-ratio feature matching unifies various anomaly levels and tasks. Finally,\nSelFormaly achieves state-of-the-art results across various datasets for all\nthe aforementioned tasks.\n","authors":["Yujin Lee","Harin Lim","Hyunsoo Yoon"],"pdf_url":"https://arxiv.org/pdf/2307.12540v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.12534v1","updated":"2023-07-24T05:43:34Z","published":"2023-07-24T05:43:34Z","title":"Towards Generalizable Deepfake Detection by Primary Region\n Regularization","summary":" The existing deepfake detection methods have reached a bottleneck in\ngeneralizing to unseen forgeries and manipulation approaches. Based on the\nobservation that the deepfake detectors exhibit a preference for overfitting\nthe specific primary regions in input, this paper enhances the generalization\ncapability from a novel regularization perspective. This can be simply achieved\nby augmenting the images through primary region removal, thereby preventing the\ndetector from over-relying on data bias. Our method consists of two stages,\nnamely the static localization for primary region maps, as well as the dynamic\nexploitation of primary region masks. The proposed method can be seamlessly\nintegrated into different backbones without affecting their inference\nefficiency. We conduct extensive experiments over three widely used deepfake\ndatasets - DFDC, DF-1.0, and Celeb-DF with five backbones. Our method\ndemonstrates an average performance improvement of 6% across different\nbackbones and performs competitively with several state-of-the-art baselines.\n","authors":["Harry Cheng","Yangyang Guo","Tianyi Wang","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2307.12534v1.pdf","comment":"12 pages. Code and Dataset: https://github.com/xaCheng1996/PRLE"},{"id":"http://arxiv.org/abs/2307.12532v1","updated":"2023-07-24T05:36:19Z","published":"2023-07-24T05:36:19Z","title":"On the Connection between Pre-training Data Diversity and Fine-tuning\n Robustness","summary":" Pre-training has been widely adopted in deep learning to improve model\nperformance, especially when the training data for a target task is limited. In\nour work, we seek to understand the implications of this training strategy on\nthe generalization properties of downstream models. More specifically, we ask\nthe following question: how do properties of the pre-training distribution\naffect the robustness of a fine-tuned model? The properties we explore include\nthe label space, label semantics, image diversity, data domains, and data\nquantity of the pre-training distribution. We find that the primary factor\ninfluencing downstream effective robustness (Taori et al., 2020) is data\nquantity, while other factors have limited significance. 
For example, reducing\nthe number of ImageNet pre-training classes by 4x while increasing the number\nof images per class by 4x (that is, keeping total data quantity fixed) does not\nimpact the robustness of fine-tuned models. We demonstrate our findings on\npre-training distributions drawn from various natural and synthetic data\nsources, primarily using the iWildCam-WILDS distribution shift as a test for\ndownstream robustness.\n","authors":["Vivek Ramanujan","Thao Nguyen","Sewoong Oh","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2307.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.18246v3","updated":"2023-07-24T05:35:30Z","published":"2023-03-31T17:59:09Z","title":"3D Human Pose Estimation via Intuitive Physics","summary":" Estimating 3D humans from images often produces implausible bodies that lean,\nfloat, or penetrate the floor. Such methods ignore the fact that bodies are\ntypically supported by the scene. A physics engine can be used to enforce\nphysical plausibility, but these are not differentiable, rely on unrealistic\nproxy bodies, and are difficult to integrate into existing optimization and\nlearning frameworks. In contrast, we exploit novel intuitive-physics (IP) terms\nthat can be inferred from a 3D SMPL body interacting with the scene. Inspired\nby biomechanics, we infer the pressure heatmap on the body, the Center of\nPressure (CoP) from the heatmap, and the SMPL body's Center of Mass (CoM). With\nthese, we develop IPMAN, to estimate a 3D body from a color image in a \"stable\"\nconfiguration by encouraging plausible floor contact and overlapping CoP and\nCoM. Our IP terms are intuitive, easy to implement, fast to compute,\ndifferentiable, and can be integrated into existing optimization and regression\nmethods. We evaluate IPMAN on standard datasets and MoYo, a new dataset with\nsynchronized multi-view images, ground-truth 3D bodies with complex poses,\nbody-floor contact, CoM and pressure. IPMAN produces more plausible results\nthan the state of the art, improving accuracy for static poses, while not\nhurting dynamic ones. Code and data are available for research at\nhttps://ipman.is.tue.mpg.de.\n","authors":["Shashank Tripathi","Lea Müller","Chun-Hao P. Huang","Omid Taheri","Michael J. Black","Dimitrios Tzionas"],"pdf_url":"https://arxiv.org/pdf/2303.18246v3.pdf","comment":"Accepted in CVPR'23. Project page: https://ipman.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2307.12526v1","updated":"2023-07-24T04:56:23Z","published":"2023-07-24T04:56:23Z","title":"Rethinking Medical Report Generation: Disease Revealing Enhancement with\n Knowledge Graph","summary":" Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG)\nbecause it reveals the relations among diseases and thus can be utilized to\nguide the generation process. However, constructing a comprehensive KG is\nlabor-intensive and its applications on the MRG process are under-explored. In\nthis study, we establish a complete KG on chest X-ray imaging that includes 137\ntypes of diseases and abnormalities. Based on this KG, we find that the current\nMRG data sets exhibit a long-tailed problem in disease distribution. To\nmitigate this problem, we introduce a novel augmentation strategy that enhances\nthe representation of disease types in the tail-end of the distribution. We\nfurther design a two-stage MRG approach, where a classifier is first trained to\ndetect whether the input images exhibit any abnormalities. 
The classified\nimages are then independently fed into two transformer-based generators,\nnamely, ``disease-specific generator\" and ``disease-free generator\" to generate\nthe corresponding reports. To enhance the clinical evaluation of whether the\ngenerated reports correctly describe the diseases appearing in the input image,\nwe propose diverse sensitivity (DS), a new metric that checks whether generated\ndiseases match ground truth and measures the diversity of all generated\ndiseases. Results show that the proposed two-stage generation framework and\naugmentation strategies improve DS by a considerable margin, indicating a\nnotable reduction in the long-tailed problem associated with under-represented\ndiseases.\n","authors":["Yixin Wang","Zihao Lin","Haoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2307.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12517v1","updated":"2023-07-24T04:21:51Z","published":"2023-07-24T04:21:51Z","title":"Entropy Transformer Networks: A Learning Approach via Tangent Bundle\n Data Manifold","summary":" This paper focuses on an accurate and fast interpolation approach for image\ntransformation employed in the design of CNN architectures. Standard Spatial\nTransformer Networks (STNs) use bilinear or linear interpolation as their\ninterpolation, with unrealistic assumptions about the underlying data\ndistributions, which leads to poor performance under scale variations.\nMoreover, STNs do not preserve the norm of gradients in propagation due to\ntheir dependency on sparse neighboring pixels. To address this problem, a novel\nEntropy STN (ESTN) is proposed that interpolates on the data manifold\ndistributions. In particular, random samples are generated for each pixel in\nassociation with the tangent space of the data manifold and construct a linear\napproximation of their intensity values with an entropy regularizer to compute\nthe transformer parameters. A simple yet effective technique is also proposed\nto normalize the non-zero values of the convolution operation, to fine-tune the\nlayers for gradients' norm-regularization during training. Experiments on\nchallenging benchmarks show that the proposed ESTN can improve predictive\naccuracy over a range of computer vision tasks, including image reconstruction,\nand classification, while reducing the computational cost.\n","authors":["Pourya Shamsolmoali","Masoumeh Zareapoor"],"pdf_url":"https://arxiv.org/pdf/2307.12517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12539v2","updated":"2023-07-24T04:20:37Z","published":"2023-04-25T03:12:54Z","title":"Text-guided Eyeglasses Manipulation with Spatial Constraints","summary":" Virtual try-on of eyeglasses involves placing eyeglasses of different shapes\nand styles onto a face image without physically trying them on. While existing\nmethods have shown impressive results, the variety of eyeglasses styles is\nlimited and the interactions are not always intuitive or efficient. To address\nthese limitations, we propose a Text-guided Eyeglasses Manipulation method that\nallows for control of the eyeglasses shape and style based on a binary mask and\ntext, respectively. Specifically, we introduce a mask encoder to extract mask\nconditions and a modulation module that enables simultaneous injection of text\nand mask conditions. This design allows for fine-grained control of the\neyeglasses' appearance based on both textual descriptions and spatial\nconstraints. 
Our approach includes a disentangled mapper and a decoupling\nstrategy that preserves irrelevant areas, resulting in better local editing. We\nemploy a two-stage training scheme to handle the different convergence speeds\nof the various modality conditions, successfully controlling both the shape and\nstyle of eyeglasses. Extensive comparison experiments and ablation analyses\ndemonstrate the effectiveness of our approach in achieving diverse eyeglasses\nstyles while preserving irrelevant areas.\n","authors":["Jiacheng Wang","Ping Liu","Jingen Liu","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2304.12539v2.pdf","comment":"Revised version: add some experiments"},{"id":"http://arxiv.org/abs/2307.11466v2","updated":"2023-07-24T03:35:03Z","published":"2023-07-21T10:02:02Z","title":"MatSpectNet: Material Segmentation Network with Domain-Aware and\n Physically-Constrained Hyperspectral Reconstruction","summary":" Achieving accurate material segmentation for 3-channel RGB images is\nchallenging due to the considerable variation in a material's appearance.\nHyperspectral images, which are sets of spectral measurements sampled at\nmultiple wavelengths, theoretically offer distinct information for material\nidentification, as variations in intensity of electromagnetic radiation\nreflected by a surface depend on the material composition of a scene. However,\nexisting hyperspectral datasets are impoverished regarding the number of images\nand material categories for the dense material segmentation task, and\ncollecting and annotating hyperspectral images with a spectral camera is\nprohibitively expensive. To address this, we propose a new model, the\nMatSpectNet to segment materials with recovered hyperspectral images from RGB\nimages. The network leverages the principles of colour perception in modern\ncameras to constrain the reconstructed hyperspectral images and employs the\ndomain adaptation method to generalise the hyperspectral reconstruction\ncapability from a spectral recovery dataset to material segmentation datasets.\nThe reconstructed hyperspectral images are further filtered using learned\nresponse curves and enhanced with human perception. The performance of\nMatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces\ndataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase\nin average pixel accuracy and a 3.42% improvement in mean class accuracy\ncompared with the most recent publication. The project code is attached to the\nsupplementary material and will be published on GitHub.\n","authors":["Yuwen Heng","Yihong Wu","Jiawen Chen","Srinandan Dasmahapatra","Hansung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.11466v2.pdf","comment":"7 pages main paper"},{"id":"http://arxiv.org/abs/2304.03483v2","updated":"2023-07-24T03:28:34Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. The first, are\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. 
The second is the recent\nRegularization by Denoising (RED), which provides a flexible framework to\nexploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. Although the main focus is on dynamic tomography, we also show\nthe performance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12502v1","updated":"2023-07-24T03:27:41Z","published":"2023-07-24T03:27:41Z","title":"Cross Contrastive Feature Perturbation for Domain Generalization","summary":" Domain generalization (DG) aims to learn a robust model from source domains\nthat generalize well on unseen target domains. Recent studies focus on\ngenerating novel domain samples or features to diversify distributions\ncomplementary to source domains. Yet, these approaches can hardly deal with the\nrestriction that the samples synthesized from various domains can cause\nsemantic distortion. In this paper, we propose an online one-stage Cross\nContrasting Feature Perturbation (CCFP) framework to simulate domain shift by\ngenerating perturbed features in the latent space while regularizing the model\nprediction against domain shift. Different from the previous fixed synthesizing\nstrategy, we design modules with learnable feature perturbations and semantic\nconsistency constraints. In contrast to prior work, our method does not use any\ngenerative-based models or domain labels. We conduct extensive experiments on a\nstandard DomainBed benchmark with a strict evaluation protocol for a fair\ncomparison. Comprehensive experiments show that our method outperforms the\nprevious state-of-the-art, and quantitative analyses illustrate that our\napproach can alleviate the domain shift problem in out-of-distribution (OOD)\nscenarios.\n","authors":["Chenming Li","Daoan Zhang","Wenjian Huang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.09186v4","updated":"2023-07-24T03:20:19Z","published":"2022-04-20T02:14:20Z","title":"Reconstruction-Aware Prior Distillation for Semi-supervised Point Cloud\n Completion","summary":" Real-world sensors often produce incomplete, irregular, and noisy point\nclouds, making point cloud completion increasingly important. However, most\nexisting completion methods rely on large paired datasets for training, which\nis labor-intensive. This paper proposes RaPD, a novel semi-supervised point\ncloud completion method that reduces the need for paired datasets. RaPD\nutilizes a two-stage training scheme, where a deep semantic prior is learned in\nstage 1 from unpaired complete and incomplete point clouds, and a\nsemi-supervised prior distillation process is introduced in stage 2 to train a\ncompletion network using only a small number of paired samples. 
Additionally, a\nself-supervised completion module is introduced to improve performance using\nunpaired incomplete point clouds. Experiments on multiple datasets show that\nRaPD outperforms previous methods in both homologous and heterologous\nscenarios.\n","authors":["Zhaoxin Fan","Yulin He","Zhicheng Wang","Kejian Wu","Hongyan Liu","Jun He"],"pdf_url":"https://arxiv.org/pdf/2204.09186v4.pdf","comment":"Accepted to IJCAI 2023"},{"id":"http://arxiv.org/abs/2307.12499v1","updated":"2023-07-24T03:10:02Z","published":"2023-07-24T03:10:02Z","title":"AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion\n Models","summary":" Unrestricted adversarial attacks present a serious threat to deep learning\nmodels and adversarial defense techniques. They pose severe security problems\nfor deep learning applications because they can effectively bypass defense\nmechanisms. However, previous attack methods often utilize Generative\nAdversarial Networks (GANs), which are not theoretically provable and thus\ngenerate unrealistic examples by incorporating adversarial objectives,\nespecially for large-scale datasets like ImageNet. In this paper, we propose a\nnew method, called AdvDiff, to generate unrestricted adversarial examples with\ndiffusion models. We design two novel adversarial guidance techniques to\nconduct adversarial sampling in the reverse generation process of diffusion\nmodels. These two techniques are effective and stable for generating high-quality,\nrealistic adversarial examples by interpretably integrating gradients of the target\nclassifier. Experimental results on MNIST and ImageNet datasets\ndemonstrate that AdvDiff is effective at generating unrestricted adversarial\nexamples and outperforms GAN-based methods in terms of attack performance\nand generation quality.\n","authors":["Xuelong Dai","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.12499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09417v2","updated":"2023-07-24T03:06:15Z","published":"2022-08-19T16:04:29Z","title":"Target-oriented Sentiment Classification with Sequential Cross-modal\n Semantic Graph","summary":" Multi-modal aspect-based sentiment classification (MABSC) is the task of\nclassifying the sentiment of a target entity mentioned in a sentence and an\nimage. However, previous methods failed to account for the fine-grained\nsemantic association between the image and the text, which resulted in limited\nidentification of fine-grained image aspects and opinions. To address these\nlimitations, in this paper we propose a new approach called SeqCSG, which\nenhances the encoder-decoder sentiment classification framework using\nsequential cross-modal semantic graphs. SeqCSG utilizes image captions and\nscene graphs to extract both global and local fine-grained image information\nand considers them as elements of the cross-modal semantic graph along with\ntokens from tweets. The sequential cross-modal semantic graph is represented as\na sequence with a multi-modal adjacency matrix indicating relationships between\nelements. Experimental results show that the approach outperforms existing\nmethods and achieves state-of-the-art performance on two standard datasets.\nFurther analysis has demonstrated that the model can implicitly learn the\ncorrelation between fine-grained information of the image and the text with the\ngiven target. Our code is available at https://github.com/zjukg/SeqCSG.\n","authors":["Yufeng Huang","Zhuo Chen","Jiaoyan Chen","Jeff Z. 
Pan","Zhen Yao","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.09417v2.pdf","comment":"ICANN 2023, https://github.com/zjukg/SeqCSG"},{"id":"http://arxiv.org/abs/2307.11411v2","updated":"2023-07-24T02:57:01Z","published":"2023-07-21T08:10:26Z","title":"Deep Directly-Trained Spiking Neural Networks for Object Detection","summary":" Spiking neural networks (SNNs) are brain-inspired energy-efficient models\nthat encode information in spatiotemporal dynamics. Recently, deep SNNs trained\ndirectly have shown great success in achieving high performance on\nclassification tasks with very few time steps. However, how to design a\ndirectly-trained SNN for the regression task of object detection still remains\na challenging problem. To address this problem, we propose EMS-YOLO, a novel\ndirectly-trained SNN framework for object detection, which is the first trial\nto train a deep SNN with surrogate gradients for object detection rather than\nANN-SNN conversion strategies. Specifically, we design a full-spike residual\nblock, EMS-ResNet, which can effectively extend the depth of the\ndirectly-trained SNN with low power consumption. Furthermore, we theoretically\nanalyze and prove the EMS-ResNet could avoid gradient vanishing or exploding.\nThe results demonstrate that our approach outperforms the state-of-the-art\nANN-SNN conversion methods (at least 500 time steps) in extremely fewer time\nsteps (only 4 time steps). It is shown that our model could achieve comparable\nperformance to the ANN with the same architecture while consuming 5.83 times\nless energy on the frame-based COCO Dataset and the event-based Gen1 Dataset.\n","authors":["Qiaoyi Su","Yuhong Chou","Yifan Hu","Jianing Li","Shijie Mei","Ziyang Zhang","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2307.11411v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.12493v1","updated":"2023-07-24T02:50:44Z","published":"2023-07-24T02:50:44Z","title":"TF-ICON: Diffusion-Based Training-Free Cross-Domain Image Composition","summary":" Text-driven diffusion models have exhibited impressive generative\ncapabilities, enabling various image editing tasks. In this paper, we propose\nTF-ICON, a novel Training-Free Image COmpositioN framework that harnesses the\npower of text-driven diffusion models for cross-domain image-guided\ncomposition. This task aims to seamlessly integrate user-provided objects into\na specific visual context. Current diffusion-based methods often involve costly\ninstance-based optimization or finetuning of pretrained models on customized\ndatasets, which can potentially undermine their rich prior. In contrast,\nTF-ICON can leverage off-the-shelf diffusion models to perform cross-domain\nimage-guided composition without requiring additional training, finetuning, or\noptimization. Moreover, we introduce the exceptional prompt, which contains no\ninformation, to facilitate text-driven diffusion models in accurately inverting\nreal images into latent representations, forming the basis for compositing. Our\nexperiments show that equipping Stable Diffusion with the exceptional prompt\noutperforms state-of-the-art inversion methods on various datasets (CelebA-HQ,\nCOCO, and ImageNet), and that TF-ICON surpasses prior baselines in versatile\nvisual domains. 
Code is available at https://github.com/Shilin-LU/TF-ICON\n","authors":["Shilin Lu","Yanzhu Liu","Adams Wai-Kin Kong"],"pdf_url":"https://arxiv.org/pdf/2307.12493v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.00932v2","updated":"2023-07-24T01:57:52Z","published":"2023-07-03T11:13:28Z","title":"A large calcium-imaging dataset reveals a systematic V4 organization for\n natural scenes","summary":" The visual system evolved to process natural scenes, yet most of our\nunderstanding of the topology and function of visual cortex derives from\nstudies using artificial stimuli. To gain deeper insights into visual\nprocessing of natural scenes, we utilized widefield calcium-imaging of primate\nV4 in response to many natural images, generating a large dataset of\ncolumnar-scale responses. We used this dataset to build a digital twin of V4\nvia deep learning, generating a detailed topographical map of natural image\npreferences at each cortical position. The map revealed clustered functional\ndomains for specific classes of natural image features. These ranged from\nsurface-related attributes like color and texture to shape-related features\nsuch as edges, curvature, and facial features. We validated the model-predicted\ndomains with additional widefield calcium-imaging and single-cell resolution\ntwo-photon imaging. Our study illuminates the detailed topological organization\nand neural codes in V4 that represent natural scenes.\n","authors":["Tianye Wang","Haoxuan Yao","Tai Sing Lee","Jiayi Hong","Yang Li","Hongfei Jiang","Ian Max Andolina","Shiming Tang"],"pdf_url":"https://arxiv.org/pdf/2307.00932v2.pdf","comment":"39 pages, 14 figures"},{"id":"http://arxiv.org/abs/2305.01788v3","updated":"2023-07-24T00:54:51Z","published":"2023-05-02T21:33:10Z","title":"Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation\n Incorporating Gloss Information","summary":" Visual Word Sense Disambiguation (VWSD) is a task to find the image that most\naccurately depicts the correct sense of the target word for the given context.\nPreviously, image-text matching models often suffered from recognizing\npolysemous words. This paper introduces an unsupervised VWSD approach that uses\ngloss information of an external lexical knowledge-base, especially the sense\ndefinitions. Specifically, we suggest employing Bayesian inference to\nincorporate the sense definitions when sense information of the answer is not\nprovided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we\npropose a context-aware definition generation with GPT-3. Experimental results\nshow that the VWSD performance significantly increased with our Bayesian\ninference-based approach. In addition, our context-aware definition generation\nachieved prominent performance improvement in OOD examples exhibiting better\nperformance than the existing definition generation method.\n","authors":["Sunjae Kwon","Rishabh Garodia","Minhwa Lee","Zhichao Yang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2305.01788v3.pdf","comment":"ACL 2023, https://aclanthology.org/2023.acl-long.88"},{"id":"http://arxiv.org/abs/2307.12463v1","updated":"2023-07-24T00:53:46Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident output\nand require correction by calibration methods. 
Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks. To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT), which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.07916v2","updated":"2023-07-24T00:29:45Z","published":"2023-04-16T23:37:24Z","title":"GaitRef: Gait Recognition with Refined Sequential Skeletons","summary":" Identifying humans by their walking sequences, known as gait recognition,\nis a useful biometric understanding task as it can be observed from a long\ndistance and does not require cooperation from the subject. Two common\nmodalities used for representing the walking sequence of a person are\nsilhouettes and joint skeletons. Silhouette sequences, which record the\nboundary of the walking person in each frame, may suffer from appearance\nvariations caused by carried objects and the person's clothing. Framewise joint\ndetections are noisy and introduce jitters that are not consistent with\nsequential detections. In this paper, we combine the silhouettes and skeletons\nand refine the framewise joint predictions for gait recognition with temporal\ninformation from the silhouette sequences. We show that the refined skeletons\ncan improve gait recognition performance without extra annotations. We compare\nour method on four public datasets, CASIA-B, OUMVLP, Gait3D and GREW, and show\nstate-of-the-art performance.\n","authors":["Haidong Zhu","Wanrong Zheng","Zhaoheng Zheng","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2304.07916v2.pdf","comment":"IJCB 2023. Code is available at\n https://github.com/haidongz-usc/GaitRef"},{"id":"http://arxiv.org/abs/2307.12459v1","updated":"2023-07-24T00:03:09Z","published":"2023-07-24T00:03:09Z","title":"Robust face anti-spoofing framework with Convolutional Vision\n Transformer","summary":" Owing to the advances in image processing technology and large-scale\ndatasets, companies have implemented facial authentication processes, thereby\nstimulating increased focus on face anti-spoofing (FAS) against realistic\npresentation attacks. Recently, various attempts have been made to improve face\nrecognition performance using both global and local learning on face images;\nhowever, to the best of our knowledge, this is the first study to investigate\nwhether the robustness of FAS against domain shifts is improved by considering\nglobal information and local cues in face images captured using self-attention\nand convolutional layers. This study proposes a convolutional vision\ntransformer-based framework that achieves robust performance for various unseen\ndomain data. Our model resulted in 7.3%$p$ and 12.9%$p$ increases in FAS\nperformance compared to models using only a convolutional neural network or\nvision transformer, respectively. 
It also shows the highest average rank in\nsub-protocols of the cross-dataset setting over the other nine benchmark models for\ndomain generalization.\n","authors":["Yunseung Lee","Youngjun Kwak","Jinho Shin"],"pdf_url":"https://arxiv.org/pdf/2307.12459v1.pdf","comment":"ICIP 2023"},{"id":"http://arxiv.org/abs/2301.06363v2","updated":"2023-07-24T23:39:15Z","published":"2023-01-16T11:17:32Z","title":"A$^2$-UAV: Application-Aware Content and Network Optimization of\n Edge-Assisted UAV Systems","summary":" To perform advanced surveillance, Unmanned Aerial Vehicles (UAVs) require the\nexecution of edge-assisted computer vision (CV) tasks. In multi-hop UAV\nnetworks, the successful transmission of these tasks to the edge is severely\nchallenged by stringent bandwidth constraints. For this reason, we propose a\nnovel A$^2$-UAV framework to optimize the number of correctly executed tasks at\nthe edge. In stark contrast with existing art, we take an application-aware\napproach and formulate a novel Application-Aware Task Planning Problem\n(A$^2$-TPP) that takes into account (i) the relationship between deep neural\nnetwork (DNN) accuracy and image compression for the classes of interest based\non the available dataset, (ii) the target positions, (iii) the current\nenergy/position of the UAVs to optimize routing, data pre-processing and target\nassignment for each UAV. We demonstrate that A$^2$-TPP is NP-Hard and propose a\npolynomial-time algorithm to solve it efficiently. We extensively evaluate\nA$^2$-UAV through real-world experiments with a testbed composed of four DJI\nMavic Air 2 UAVs. We consider state-of-the-art image classification tasks with\nfour different DNN models (i.e., DenseNet, ResNet152, ResNet50 and\nMobileNet-V2) and object detection tasks using YoloV4 trained on the ImageNet\ndataset. Results show that A$^2$-UAV attains on average around 38% more\naccomplished tasks than the state-of-the-art, with 400% more accomplished tasks\nwhen the number of targets increases significantly. To allow full\nreproducibility, we pledge to share datasets and code with the research\ncommunity.\n","authors":["Andrea Coletta","Flavio Giorgi","Gaia Maselli","Matteo Prata","Domenicomichele Silvestri","Jonathan Ashdown","Francesco Restuccia"],"pdf_url":"https://arxiv.org/pdf/2301.06363v2.pdf","comment":"Accepted to INFOCOM 2023"},{"id":"http://arxiv.org/abs/2307.13136v1","updated":"2023-07-24T21:29:48Z","published":"2023-07-24T21:29:48Z","title":"Does Progress On Object Recognition Benchmarks Improve Real-World\n Generalization?","summary":" For more than a decade, researchers have measured progress in object\nrecognition on ImageNet-based generalization benchmarks such as ImageNet-A, -C,\nand -R. Recent advances in foundation models, trained on orders of magnitude\nmore data, have begun to saturate these standard benchmarks, but remain brittle\nin practice. This suggests standard benchmarks, which tend to focus on\npredefined or synthetic changes, may not be sufficient for measuring real world\ngeneralization. Consequently, we propose studying generalization across\ngeography as a more realistic measure of progress using two datasets of objects\nfrom households across the globe. We conduct an extensive empirical evaluation\nof progress across nearly 100 vision models up to the most recent foundation\nmodels. 
We first identify a progress gap between standard benchmarks and\nreal-world, geographical shifts: progress on ImageNet results in up to 2.5x\nmore progress on standard generalization benchmarks than real-world\ndistribution shifts. Second, we study model generalization across geographies\nby measuring the disparities in performance across regions, a more fine-grained\nmeasure of real world generalization. We observe all models have large\ngeographic disparities, even foundation CLIP models, with differences of 7-20%\nin accuracy between regions. Counter to modern intuition, we discover progress\non standard benchmarks fails to improve geographic disparities and often\nexacerbates them: geographic disparities between the least performant models\nand today's best models have more than tripled. Our results suggest scaling\nalone is insufficient for consistent robustness to real-world distribution\nshifts. Finally, we highlight in early experiments how simple last layer\nretraining on more representative, curated data can complement scaling as a\npromising direction of future work, reducing geographic disparity on both\nbenchmarks by over two-thirds.\n","authors":["Megan Richards","Polina Kirichenko","Diane Bouchacourt","Mark Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2307.13136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13133v1","updated":"2023-07-24T21:22:58Z","published":"2023-07-24T21:22:58Z","title":"simPLE: a visuotactile method learned in simulation to precisely pick,\n localize, regrasp, and place objects","summary":" Existing robotic systems have a clear tension between generality and\nprecision. Deployed solutions for robotic manipulation tend to fall into the\nparadigm of one robot solving a single task, lacking precise generalization,\ni.e., the ability to solve many tasks without compromising on precision. This\npaper explores solutions for precise and general pick-and-place. In precise\npick-and-place, i.e. kitting, the robot transforms an unstructured arrangement\nof objects into an organized arrangement, which can facilitate further\nmanipulation. We propose simPLE (simulation to Pick Localize and PLacE) as a\nsolution to precise pick-and-place. simPLE learns to pick, regrasp and place\nobjects precisely, given only the object CAD model and no prior experience. We\ndevelop three main components: task-aware grasping, visuotactile perception,\nand regrasp planning. Task-aware grasping computes affordances of grasps that\nare stable, observable, and favorable to placing. The visuotactile perception\nmodel relies on matching real observations against a set of simulated ones\nthrough supervised learning. Finally, we compute the desired robot motion by\nsolving a shortest path problem on a graph of hand-to-hand regrasps. On a\ndual-arm robot equipped with visuotactile sensing, we demonstrate\npick-and-place of 15 diverse objects with simPLE. The objects span a wide range\nof shapes and simPLE achieves successful placements into structured\narrangements with 1mm clearance over 90% of the time for 6 objects, and over\n80% of the time for 11 objects. 
Videos are available at\nhttp://mcube.mit.edu/research/simPLE.html .\n","authors":["Maria Bauza","Antonia Bronars","Yifan Hou","Ian Taylor","Nikhil Chavan-Dafle","Alberto Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2307.13133v1.pdf","comment":"33 pages, 6 figures, 2 tables, submitted to Science Robotics"},{"id":"http://arxiv.org/abs/2205.04691v3","updated":"2023-07-24T20:56:50Z","published":"2022-05-10T06:24:09Z","title":"An Asynchronous Event-Based Algorithm for Periodic Signals","summary":" Let $0\\leq\\tau_{1}\\leq\\tau_{2}\\leq\\cdots\\leq\\tau_{m}\\leq1$, originated from a\nuniform distribution. Let also $\\epsilon,\\delta\\in\\mathbb{R}$, and\n$d\\in\\mathbb{N}$. What is the probability of having more than $d$ adjacent\n$\\tau_{i}$-s pairs that the distance between them is $\\delta$, up to an error\n$\\epsilon$ ? In this paper we are going to show how this untreated theoretical\nprobabilistic problem arises naturally from the motivation of analyzing a\nsimple asynchronous algorithm for detection of signals with a known frequency,\nusing the novel technology of an event camera.\n","authors":["David El-Chai Ben-Ezra","Ron Arad","Ayelet Padowicz","Israel Tugendhaft"],"pdf_url":"https://arxiv.org/pdf/2205.04691v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2307.13125v1","updated":"2023-07-24T20:53:59Z","published":"2023-07-24T20:53:59Z","title":"Deep Learning Approaches for Data Augmentation in Medical Imaging: A\n Review","summary":" Deep learning has become a popular tool for medical image analysis, but the\nlimited availability of training data remains a major challenge, particularly\nin the medical field where data acquisition can be costly and subject to\nprivacy regulations. Data augmentation techniques offer a solution by\nartificially increasing the number of training samples, but these techniques\noften produce limited and unconvincing results. To address this issue, a\ngrowing number of studies have proposed the use of deep generative models to\ngenerate more realistic and diverse data that conform to the true distribution\nof the data. In this review, we focus on three types of deep generative models\nfor medical image augmentation: variational autoencoders, generative\nadversarial networks, and diffusion models. We provide an overview of the\ncurrent state of the art in each of these models and discuss their potential\nfor use in different downstream tasks in medical imaging, including\nclassification, segmentation, and cross-modal translation. We also evaluate the\nstrengths and limitations of each model and suggest directions for future\nresearch in this field. Our goal is to provide a comprehensive review about the\nuse of deep generative models for medical image augmentation and to highlight\nthe potential of these models for improving the performance of deep learning\nalgorithms in medical image analysis.\n","authors":["Aghiles Kebaili","Jérôme Lapuyade-Lahorgue","Su Ruan"],"pdf_url":"https://arxiv.org/pdf/2307.13125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13110v1","updated":"2023-07-24T19:59:15Z","published":"2023-07-24T19:59:15Z","title":"Automatic Infant Respiration Estimation from Video: A Deep Flow-based\n Algorithm and a Novel Public Benchmark","summary":" Respiration is a critical vital sign for infants, and continuous respiratory\nmonitoring is particularly important for newborns. However, neonates are\nsensitive and contact-based sensors present challenges in comfort, hygiene, and\nskin health, especially for preterm babies. 
As a step toward fully automatic,\ncontinuous, and contactless respiratory monitoring, we develop a deep-learning\nmethod for estimating respiratory rate and waveform from plain video footage in\nnatural settings. Our automated infant respiration flow-based network\n(AIRFlowNet) combines video-extracted optical flow input and spatiotemporal\nconvolutional processing tuned to the infant domain. We support our model with\nthe first public annotated infant respiration dataset with 125 videos\n(AIR-125), drawn from eight infant subjects, with varied pose, lighting, and\ncamera conditions. We include manual respiration annotations and optimize\nAIRFlowNet training on them using a novel spectral bandpass loss function. When\ntrained and tested on the AIR-125 infant data, our method significantly\noutperforms other state-of-the-art methods in respiratory rate estimation,\nachieving a mean absolute error of $\\sim$2.9 breaths per minute, compared to\n$\\sim$4.7--6.2 for other public models designed for adult subjects and more\nuniform environments.\n","authors":["Sai Kumar Reddy Manne","Shaotong Zhu","Sarah Ostadabbas","Michael Wan"],"pdf_url":"https://arxiv.org/pdf/2307.13110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05799v2","updated":"2023-07-24T19:13:20Z","published":"2023-07-11T20:46:19Z","title":"3D Medical Image Segmentation based on multi-scale MPU-Net","summary":" The high cure rate of cancer is inextricably linked to physicians' accuracy\nin diagnosis and treatment; therefore, a model that can accomplish\nhigh-precision tumor segmentation has become a necessity in many applications\nof the medical industry. It can effectively lower the rate of misdiagnosis\nwhile considerably lessening the burden on clinicians. However, fully automated\ntarget organ segmentation is problematic due to the irregular stereo structure\nof 3D volume organs. As a basic model for this class of real applications,\nU-Net excels. It can learn certain global and local features, but still lacks\nthe capacity to grasp spatial long-range relationships and contextual\ninformation at multiple scales. This paper proposes a tumor segmentation model,\nMPU-Net, for patient volume CT images, which is inspired by the Transformer with a\nglobal attention mechanism. By combining image serialization with the Position\nAttention Module, the model attempts to comprehend deeper contextual\ndependencies and accomplish precise positioning. Each layer of the decoder is\nalso equipped with a multi-scale module and a cross-attention mechanism. The\ncapability of feature extraction and integration at different levels has been\nenhanced, and the hybrid loss function developed in this study can better\nexploit high-resolution characteristic information. Moreover, the suggested\narchitecture is tested and evaluated on the Liver Tumor Segmentation Challenge\n2017 (LiTS 2017) dataset. Compared with the benchmark model U-Net, MPU-Net\nshows excellent segmentation results. The dice, accuracy, precision,\nspecificity, IOU, and MCC metrics for the best model segmentation results are\n92.17%, 99.08%, 91.91%, 99.52%, 85.91%, and 91.74%, respectively. Outstanding\nindicators in various aspects illustrate the exceptional performance of this\nframework in automatic medical image segmentation.\n","authors":["Zeqiu. Yu","Shuo. Han","Ziheng. 
Song"],"pdf_url":"https://arxiv.org/pdf/2307.05799v2.pdf","comment":"37 pages"},{"id":"http://arxiv.org/abs/2307.13078v1","updated":"2023-07-24T18:59:46Z","published":"2023-07-24T18:59:46Z","title":"Adaptive Certified Training: Towards Better Accuracy-Robustness\n Tradeoffs","summary":" As deep learning models continue to advance and are increasingly utilized in\nreal-world systems, the issue of robustness remains a major challenge. Existing\ncertified training methods produce models that achieve high provable robustness\nguarantees at certain perturbation levels. However, the main problem of such\nmodels is a dramatically low standard accuracy, i.e. accuracy on clean\nunperturbed data, that makes them impractical. In this work, we consider a more\nrealistic perspective of maximizing the robustness of a model at certain levels\nof (high) standard accuracy. To this end, we propose a novel certified training\nmethod based on a key insight that training with adaptive certified radii helps\nto improve both the accuracy and robustness of the model, advancing\nstate-of-the-art accuracy-robustness tradeoffs. We demonstrate the\neffectiveness of the proposed method on MNIST, CIFAR-10, and TinyImageNet\ndatasets. Particularly, on CIFAR-10 and TinyImageNet, our method yields models\nwith up to two times higher robustness, measured as an average certified radius\nof a test set, at the same levels of standard accuracy compared to baseline\napproaches.\n","authors":["Zhakshylyk Nurlanov","Frank R. Schmidt","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2307.13078v1.pdf","comment":"Presented at ICML 2023 workshop \"New Frontiers in Adversarial Machine\n Learning\""},{"id":"http://arxiv.org/abs/2307.09588v2","updated":"2023-07-24T18:52:54Z","published":"2023-07-18T19:51:28Z","title":"Automating Wood Species Detection and Classification in Microscopic\n Images of Fibrous Materials with Deep Learning","summary":" We have developed a methodology for the systematic generation of a large\nimage dataset of macerated wood references, which we used to generate image\ndata for nine hardwood genera. This is the basis for a substantial approach to\nautomate, for the first time, the identification of hardwood species in\nmicroscopic images of fibrous materials by deep learning. Our methodology\nincludes a flexible pipeline for easy annotation of vessel elements. We compare\nthe performance of different neural network architectures and hyperparameters.\nOur proposed method performs similarly well to human experts. In the future,\nthis will improve controls on global wood fiber product flows to protect\nforests.\n","authors":["Lars Nieradzik","Jördis Sieburg-Rockel","Stephanie Helmling","Janis Keuper","Thomas Weibel","Andrea Olbrich","Henrike Stephani"],"pdf_url":"https://arxiv.org/pdf/2307.09588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13069v1","updated":"2023-07-24T18:50:49Z","published":"2023-07-24T18:50:49Z","title":"General-Purpose Multi-Modal OOD Detection Framework","summary":" Out-of-distribution (OOD) detection identifies test samples that differ from\nthe training data, which is critical to ensuring the safety and reliability of\nmachine learning (ML) systems. While a plethora of methods have been developed\nto detect uni-modal OOD samples, only a few have focused on multi-modal OOD\ndetection. Current contrastive learning-based methods primarily study\nmulti-modal OOD detection in a scenario where both a given image and its\ncorresponding textual description come from a new domain. 
However, real-world\ndeployments of ML systems may face more anomaly scenarios caused by multiple\nfactors like sensor faults, bad weather, and environmental changes. Hence, the\ngoal of this work is to simultaneously detect from multiple different OOD\nscenarios in a fine-grained manner. To reach this goal, we propose a\ngeneral-purpose weakly-supervised OOD detection framework, called WOOD, that\ncombines a binary classifier and a contrastive learning component to reap the\nbenefits of both. In order to better distinguish the latent representations of\nin-distribution (ID) and OOD samples, we adopt the Hinge loss to constrain\ntheir similarity. Furthermore, we develop a new scoring metric to integrate the\nprediction results from both the binary classifier and contrastive learning for\nidentifying OOD samples. We evaluate the proposed WOOD model on multiple\nreal-world datasets, and the experimental results demonstrate that the WOOD\nmodel outperforms the state-of-the-art methods for multi-modal OOD detection.\nImportantly, our approach is able to achieve high accuracy in OOD detection in\nthree different OOD scenarios simultaneously. The source code will be made\npublicly available upon publication.\n","authors":["Viet Duong","Qiong Wu","Zhengyi Zhou","Eric Zavesky","Jiahe Chen","Xiangzhou Liu","Wen-Ling Hsu","Huajie Shao"],"pdf_url":"https://arxiv.org/pdf/2307.13069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13060v1","updated":"2023-07-24T18:19:39Z","published":"2023-07-24T18:19:39Z","title":"On the characteristics of natural hydraulic dampers: An image-based\n approach to study the fluid flow behaviour inside the human meniscal tissue","summary":" The meniscal tissue is a layered material with varying properties influenced\nby collagen content and arrangement. Understanding the relationship between\nstructure and properties is crucial for disease management, treatment\ndevelopment, and biomaterial design. The internal layer of the meniscus is\nsofter and more deformable than the outer layers, thanks to interconnected\ncollagen channels that guide fluid flow. To investigate these relationships, we\npropose a novel approach that combines Computational Fluid Dynamics (CFD) with\nImage Analysis (CFD-IA). We analyze fluid flow in the internal architecture of\nthe human meniscus across a range of inlet velocities (0.1mm/s to 1.6m/s) using\nhigh-resolution 3D micro-computed tomography scans. Statistical correlations\nare observed between architectural parameters (tortuosity, connectivity,\nporosity, pore size) and fluid flow parameters (Re number distribution,\npermeability). Some channels exhibit Re values of 1400 at an inlet velocity of\n1.6m/s, and a transition from Darcy's regime to a non-Darcian regime occurs\naround an inlet velocity of 0.02m/s. Location-dependent permeability ranges\nfrom 20-32 Darcy. Regression modelling reveals a strong correlation between\nfluid velocity and tortuosity at high inlet velocities, as well as with channel\ndiameter at low inlet velocities. At higher inlet velocities, flow paths\ndeviate more from the preferential direction, resulting in a decrease in the\nconcentration parameter by an average of 0.4. This research provides valuable\ninsights into the fluid flow behaviour within the meniscus and its structural\ninfluences.\n","authors":["J. Waghorne","F. P. Bonomo","A. Rabbani","D. Bell","O. 
Barrera"],"pdf_url":"https://arxiv.org/pdf/2307.13060v1.pdf","comment":"20 Pages, 5 Figures"},{"id":"http://arxiv.org/abs/2307.02625v2","updated":"2023-07-24T18:16:38Z","published":"2023-07-05T19:56:50Z","title":"Retinex-based Image Denoising / Contrast Enhancement using Gradient\n Graph Laplacian Regularizer","summary":" Images captured in poorly lit conditions are often corrupted by acquisition\nnoise. Leveraging recent advances in graph-based regularization, we propose a\nfast Retinex-based restoration scheme that denoises and contrast-enhances an\nimage. Specifically, by Retinex theory we first assume that each image pixel is\na multiplication of its reflectance and illumination components. We next assume\nthat the reflectance and illumination components are piecewise constant (PWC)\nand continuous piecewise planar (PWP) signals, which can be recovered via graph\nLaplacian regularizer (GLR) and gradient graph Laplacian regularizer (GGLR)\nrespectively. We formulate quadratic objectives regularized by GLR and GGLR,\nwhich are minimized alternately until convergence by solving linear systems --\nwith improved condition numbers via proposed preconditioners -- via conjugate\ngradient (CG) efficiently. Experimental results show that our algorithm\nachieves competitive visual image quality while reducing computation complexity\nnoticeably.\n","authors":["Yeganeh Gharedaghi","Gene Cheung","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2307.02625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13011v1","updated":"2023-07-24T13:47:30Z","published":"2023-07-24T13:47:30Z","title":"Maximal Independent Sets for Pooling in Graph Neural Networks","summary":" Convolutional Neural Networks (CNNs) have enabled major advances in image\nclassification through convolution and pooling. In particular, image pooling\ntransforms a connected discrete lattice into a reduced lattice with the same\nconnectivity and allows reduction functions to consider all pixels in an image.\nHowever, there is no pooling that satisfies these properties for graphs. In\nfact, traditional graph pooling methods suffer from at least one of the\nfollowing drawbacks: Graph disconnection or overconnection, low decimation\nratio, and deletion of large parts of graphs. In this paper, we present three\npooling methods based on the notion of maximal independent sets that avoid\nthese pitfalls. Our experimental results confirm the relevance of maximal\nindependent set constraints for graph pooling.\n","authors":["Stevan Stanovic","Benoit Gaüzère","Luc Brun"],"pdf_url":"https://arxiv.org/pdf/2307.13011v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.09683v2","updated":"2023-07-24T15:41:03Z","published":"2023-07-18T23:35:53Z","title":"PubMed and Beyond: Recent Advances and Best Practices in Biomedical\n Literature Search","summary":" Biomedical research yields a wealth of information, much of which is only\naccessible through the literature. Consequently, literature search is an\nessential tool for building on prior knowledge in clinical and biomedical\nresearch. Although recent improvements in artificial intelligence have expanded\nfunctionality beyond keyword-based search, these advances may be unfamiliar to\nclinicians and researchers. In response, we present a survey of literature\nsearch tools tailored to both general and specific information needs in\nbiomedicine, with the objective of helping readers efficiently fulfill their\ninformation needs. 
We first examine the widely used PubMed search engine,\ndiscussing recent improvements and continued challenges. We then describe\nliterature search tools catering to five specific information needs: 1.\nIdentifying high-quality clinical research for evidence-based medicine. 2.\nRetrieving gene-related information for precision medicine and genomics. 3.\nSearching by meaning, including natural language questions. 4. Locating related\narticles with literature recommendation. 5. Mining literature to discover\nassociations between concepts such as diseases and genetic variants.\nAdditionally, we cover practical considerations and best practices for choosing\nand using these tools. Finally, we provide a perspective on the future of\nliterature search engines, considering recent breakthroughs in large language\nmodels such as ChatGPT. In summary, our survey provides a comprehensive view of\nbiomedical literature search functionalities with 36 publicly available tools.\n","authors":["Qiao Jin","Robert Leaman","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.09683v2.pdf","comment":"27 pages, 6 figures, 36 tools"},{"id":"http://arxiv.org/abs/2307.12810v1","updated":"2023-07-24T14:00:07Z","published":"2023-07-24T14:00:07Z","title":"HeteFedRec: Federated Recommender Systems with Model Heterogeneity","summary":" Owing to the nature of privacy protection, federated recommender systems\n(FedRecs) have garnered increasing interest in the realm of on-device\nrecommender systems. However, most existing FedRecs only allow participating\nclients to collaboratively train a recommendation model of the same public\nparameter size. Training a model of the same size for all clients can lead to\nsuboptimal performance since clients possess varying resources. For example,\nclients with limited training data may prefer to train a smaller recommendation\nmodel to avoid excessive data consumption, while clients with sufficient data\nwould benefit from a larger model to achieve higher recommendation accuracy. To\naddress the above challenge, this paper introduces HeteFedRec, a novel FedRec\nframework that enables the assignment of personalized model sizes to\nparticipants. In HeteFedRec, we present a heterogeneous recommendation model\naggregation strategy, including a unified dual-task learning mechanism and a\ndimensional decorrelation regularization, to allow knowledge aggregation among\nrecommender models of different sizes. Additionally, a relation-based ensemble\nknowledge distillation method is proposed to effectively distil knowledge from\nheterogeneous item embeddings. Extensive experiments conducted on three\nreal-world recommendation datasets demonstrate the effectiveness and efficiency\nof HeteFedRec in training federated recommender systems under heterogeneous\nsettings.\n","authors":["Wei Yuan","Liang Qu","Lizhen Cui","Yongxin Tong","Xiaofang Zhou","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2307.12810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12798v1","updated":"2023-07-24T13:51:19Z","published":"2023-07-24T13:51:19Z","title":"RRAML: Reinforced Retrieval Augmented Machine Learning","summary":" The emergence of large language models (LLMs) has revolutionized machine\nlearning and related fields, showcasing remarkable abilities in comprehending,\ngenerating, and manipulating human language. However, their conventional usage\nthrough API-based text prompt submissions imposes certain limitations in terms\nof context constraints and external source availability. 
To address these\nchallenges, we propose a novel framework called Reinforced Retrieval Augmented\nMachine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs\nwith supporting information retrieved by a purpose-built retriever from a vast\nuser-provided database. By leveraging recent advancements in reinforcement\nlearning, our method effectively addresses several critical challenges.\nFirstly, it circumvents the need for accessing LLM gradients. Secondly, our\nmethod alleviates the burden of retraining LLMs for specific tasks, as it is\noften impractical or impossible due to restricted access to the model and the\ncomputational intensity involved. Additionally we seamlessly link the\nretriever's task with the reasoner, mitigating hallucinations and reducing\nirrelevant, and potentially damaging retrieved documents. We believe that the\nresearch agenda outlined in this paper has the potential to profoundly impact\nthe field of AI, democratizing access to and utilization of LLMs for a wide\nrange of entities.\n","authors":["Andrea Bacciu","Florin Cocunasu","Federico Siciliano","Fabrizio Silvestri","Nicola Tonellotto","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2307.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12756v1","updated":"2023-07-24T12:58:47Z","published":"2023-07-24T12:58:47Z","title":"Unbiased Delayed Feedback Label Correction for Conversion Rate\n Prediction","summary":" Conversion rate prediction is critical to many online applications such as\ndigital display advertising. To capture dynamic data distribution, industrial\nsystems often require retraining models on recent data daily or weekly.\nHowever, the delay of conversion behavior usually leads to incorrect labeling,\nwhich is called delayed feedback problem. Existing work may fail to introduce\nthe correct information about false negative samples due to data sparsity and\ndynamic data distribution. To directly introduce the correct feedback label\ninformation, we propose an Unbiased delayed feedback Label Correction framework\n(ULC), which uses an auxiliary model to correct labels for observed negative\nfeedback samples. Firstly, we theoretically prove that the label-corrected loss\nis an unbiased estimate of the oracle loss using true labels. Then, as there\nare no ready training data for label correction, counterfactual labeling is\nused to construct artificial training data. Furthermore, since counterfactual\nlabeling utilizes only partial training data, we design an embedding-based\nalternative training method to enhance performance. Comparative experiments on\nboth public and private datasets and detailed analyses show that our proposed\napproach effectively alleviates the delayed feedback problem and consistently\noutperforms the previous state-of-the-art methods.\n","authors":["Yifan Wang","Peijie Sun","Min Zhang","Qinglin Jia","Jingjie Li","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2307.12756v1.pdf","comment":"accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2307.12576v1","updated":"2023-07-24T07:47:21Z","published":"2023-07-24T07:47:21Z","title":"Self-refining of Pseudo Labels for Music Source Separation with Noisy\n Labeled Data","summary":" Music source separation (MSS) faces challenges due to the limited\navailability of correctly-labeled individual instrument tracks. With the push\nto acquire larger datasets to improve MSS performance, the inevitability of\nencountering mislabeled individual instrument tracks becomes a significant\nchallenge to address. 
This paper introduces an automated technique for refining\nthe labels in a partially mislabeled dataset. Our proposed self-refining\ntechnique, employed with a noisy-labeled dataset, results in only a 1% accuracy\ndegradation in multi-label instrument recognition compared to a classifier\ntrained on a clean-labeled dataset. The study demonstrates the importance of\nrefining noisy-labeled data in MSS model training and shows that utilizing the\nrefined dataset leads to comparable results derived from a clean-labeled\ndataset. Notably, upon only access to a noisy dataset, MSS models trained on a\nself-refined dataset even outperform those trained on a dataset refined with a\nclassifier trained on clean labels.\n","authors":["Junghyun Koo","Yunkee Chae","Chang-Bin Jeon","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12576v1.pdf","comment":"24th International Society for Music Information Retrieval Conference\n (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2307.10617v3","updated":"2023-07-24T07:03:01Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. 
The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.12518v1","updated":"2023-07-24T04:23:08Z","published":"2023-07-24T04:23:08Z","title":"FaFCNN: A General Disease Classification Framework Based on Feature\n Fusion Neural Networks","summary":" There are two fundamental problems in applying deep learning/machine learning\nmethods to disease classification tasks, one is the insufficient number and\npoor quality of training samples; another one is how to effectively fuse\nmultiple source features and thus train robust classification models. To\naddress these problems, inspired by the process of human learning knowledge, we\npropose the Feature-aware Fusion Correlation Neural Network (FaFCNN), which\nintroduces a feature-aware interaction module and a feature alignment module\nbased on domain adversarial learning. This is a general framework for disease\nclassification, and FaFCNN improves the way existing methods obtain sample\ncorrelation features. The experimental results show that training using\naugmented features obtained by pre-training gradient boosting decision tree\nyields more performance gains than random-forest based methods. On the\nlow-quality dataset with a large amount of missing data in our setup, FaFCNN\nobtains a consistently optimal performance compared to competitive baselines.\nIn addition, extensive experiments demonstrate the robustness of the proposed\nmethod and the effectiveness of each component of the model\\footnote{Accepted\nin IEEE SMC2023}.\n","authors":["Menglin Kong","Shaojie Zhao","Juan Cheng","Xingquan Li","Ri Su","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13165v1","updated":"2023-07-24T23:26:46Z","published":"2023-07-24T23:26:46Z","title":"Investigating the Robustness of Sequential Recommender Systems Against\n Training Data Perturbations: an Empirical Study","summary":" Sequential Recommender Systems (SRSs) have been widely used to model user\nbehavior over time, but their robustness in the face of perturbations to\ntraining data is a critical issue. In this paper, we conduct an empirical study\nto investigate the effects of removing items at different positions within a\ntemporally ordered sequence. We evaluate two different SRS models on multiple\ndatasets, measuring their performance using Normalized Discounted Cumulative\nGain (NDCG) and Rank Sensitivity List metrics. Our results demonstrate that\nremoving items at the end of the sequence significantly impacts performance,\nwith NDCG decreasing up to 60\\%, while removing items from the beginning or\nmiddle has no significant effect. 
These findings highlight the importance of\nconsidering the position of the perturbed items in the training data and shall\ninform the design of more robust SRSs.\n","authors":["Filippo Betello","Federico Siciliano","Pushkar Mishra","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2307.13165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.15498v2","updated":"2023-07-24T20:08:20Z","published":"2021-06-29T15:25:33Z","title":"Classification of Consumer Belief Statements From Social Media","summary":" Social media offer plenty of information to perform market research in order\nto meet the requirements of customers. One way how this research is conducted\nis that a domain expert gathers and categorizes user-generated content into a\ncomplex and fine-grained class structure. In many of such cases, little data\nmeets complex annotations. It is not yet fully understood how this can be\nleveraged successfully for classification. We examine the classification\naccuracy of expert labels when used with a) many fine-grained classes and b)\nfew abstract classes. For scenario b) we compare abstract class labels given by\nthe domain expert as baseline and by automatic hierarchical clustering. We\ncompare this to another baseline where the entire class structure is given by a\ncompletely unsupervised clustering approach. By doing so, this work can serve\nas an example of how complex expert annotations are potentially beneficial and\ncan be utilized in the most optimal way for opinion mining in highly specific\ndomains. By exploring across a range of techniques and experiments, we find\nthat automated class abstraction approaches in particular the unsupervised\napproach performs remarkably well against domain expert baseline on text\nclassification tasks. This has the potential to inspire opinion mining\napplications in order to support market researchers in practice and to inspire\nfine-grained automated content analysis on a large scale.\n","authors":["Gerhard Johann Hagerer","Wenbin Le","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2106.15498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.02259v3","updated":"2023-07-24T20:03:14Z","published":"2021-11-03T14:49:50Z","title":"A Case Study and Qualitative Analysis of Simple Cross-Lingual Opinion\n Mining","summary":" User-generated content from social media is produced in many languages,\nmaking it technically challenging to compare the discussed themes from one\ndomain across different cultures and regions. It is relevant for domains in a\nglobalized world, such as market research, where people from two nations and\nmarkets might have different requirements for a product. We propose a simple,\nmodern, and effective method for building a single topic model with sentiment\nanalysis capable of covering multiple languages simultanteously, based on a\npre-trained state-of-the-art deep neural network for natural language\nunderstanding. To demonstrate its feasibility, we apply the model to newspaper\narticles and user comments of a specific domain, i.e., organic food products\nand related consumption behavior. 
The themes match across languages.\nAdditionally, we obtain an high proportion of stable and domain-relevant\ntopics, a meaningful relation between topics and their respective textual\ncontents, and an interpretable representation for social media documents.\nMarketing can potentially benefit from our method, since it provides an\neasy-to-use means of addressing specific customer interests from different\nmarket regions around the globe. For reproducibility, we provide the code,\ndata, and results of our study.\n","authors":["Gerhard Johann Hagerer","Wing Sheung Leung","Qiaoxi Liu","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2111.02259v3.pdf","comment":"10 pages, 2 tables, 5 figures, full paper, peer-reviewed, published\n at KDIR/IC3k 2021 conference"},{"id":"http://arxiv.org/abs/2304.04759v2","updated":"2023-07-24T18:10:09Z","published":"2023-04-07T23:10:39Z","title":"Similarity search in the blink of an eye with compressed indices","summary":" Nowadays, data is represented by vectors. Retrieving those vectors, among\nmillions and billions, that are similar to a given query is a ubiquitous\nproblem, known as similarity search, of relevance for a wide range of\napplications. Graph-based indices are currently the best performing techniques\nfor billion-scale similarity search. However, their random-access memory\npattern presents challenges to realize their full potential. In this work, we\npresent new techniques and systems for creating faster and smaller graph-based\nindices. To this end, we introduce a novel vector compression method,\nLocally-adaptive Vector Quantization (LVQ), that uses per-vector scaling and\nscalar quantization to improve search performance with fast similarity\ncomputations and a reduced effective bandwidth, while decreasing memory\nfootprint and barely impacting accuracy. LVQ, when combined with a new\nhigh-performance computing system for graph-based similarity search,\nestablishes the new state of the art in terms of performance and memory\nfootprint. For billions of vectors, LVQ outcompetes the second-best\nalternatives: (1) in the low-memory regime, by up to 20.7x in throughput with\nup to a 3x memory footprint reduction, and (2) in the high-throughput regime by\n5.8x with 1.4x less memory.\n","authors":["Cecilia Aguerrebere","Ishwar Bhati","Mark Hildebrand","Mariano Tepper","Ted Willke"],"pdf_url":"https://arxiv.org/pdf/2304.04759v2.pdf","comment":"VLDB 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.12983v1","updated":"2023-07-24T17:59:37Z","published":"2023-07-24T17:59:37Z","title":"Parallel $Q$-Learning: Scaling Off-policy Reinforcement Learning under\n Massively Parallel Simulation","summary":" Reinforcement learning is time-consuming for complex tasks due to the need\nfor large amounts of training data. Recent advances in GPU-based simulation,\nsuch as Isaac Gym, have sped up data collection thousands of times on a\ncommodity GPU. Most prior works used on-policy methods like PPO due to their\nsimplicity and ease of scaling. Off-policy methods are more data efficient but\nchallenging to scale, resulting in a longer wall-clock training time. This\npaper presents a Parallel $Q$-Learning (PQL) scheme that outperforms PPO in\nwall-clock time while maintaining superior sample efficiency of off-policy\nlearning. PQL achieves this by parallelizing data collection, policy learning,\nand value learning. 
Different from prior works on distributed off-policy\nlearning, such as Apex, our scheme is designed specifically for massively\nparallel GPU-based simulation and optimized to work on a single workstation. In\nexperiments, we demonstrate that $Q$-learning can be scaled to \\textit{tens of\nthousands of parallel environments} and investigate important factors affecting\nlearning speed. The code is available at https://github.com/Improbable-AI/pql.\n","authors":["Zechu Li","Tao Chen","Zhang-Wei Hong","Anurag Ajay","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2307.12983v1.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi- view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: : https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2303.06147v2","updated":"2023-07-24T17:58:45Z","published":"2023-03-10T18:59:57Z","title":"Exphormer: Sparse Transformers for Graphs","summary":" Graph transformers have emerged as a promising architecture for a variety of\ngraph learning and representation tasks. Despite their successes, though, it\nremains challenging to scale graph transformers to large graphs while\nmaintaining accuracy competitive with message-passing networks. In this paper,\nwe introduce Exphormer, a framework for building powerful and scalable graph\ntransformers. 
Exphormer consists of a sparse attention mechanism based on two\nmechanisms: virtual global nodes and expander graphs, whose mathematical\ncharacteristics, such as spectral expansion, pseduorandomness, and sparsity,\nyield graph transformers with complexity only linear in the size of the graph,\nwhile allowing us to prove desirable theoretical properties of the resulting\ntransformer models. We show that incorporating Exphormer into the\nrecently-proposed GraphGPS framework produces models with competitive empirical\nresults on a wide variety of graph datasets, including state-of-the-art results\non three datasets. We also show that Exphormer can scale to datasets on larger\ngraphs than shown in previous graph transformer architectures. Code can be\nfound at \\url{https://github.com/hamed1375/Exphormer}.\n","authors":["Hamed Shirzad","Ameya Velingker","Balaji Venkatachalam","Danica J. Sutherland","Ali Kemal Sinop"],"pdf_url":"https://arxiv.org/pdf/2303.06147v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05407v3","updated":"2023-07-24T17:58:31Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen unknown categories into instances, without any prior knowledge about\nthem, while performing panoptic segmentation of known classes. We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12979v1","updated":"2023-07-24T17:56:58Z","published":"2023-07-24T17:56:58Z","title":"An Isometric Stochastic Optimizer","summary":" The Adam optimizer is the standard choice in deep learning applications. I\npropose a simple explanation of Adam's success: it makes each parameter's step\nsize independent of the norms of the other parameters. 
Based on this principle\nI derive Iso, a new optimizer which makes the norm of a parameter's update\ninvariant to the application of any linear transformation to its inputs and\noutputs. I develop a variant of Iso called IsoAdam that allows optimal\nhyperparameters to be transferred from Adam, and demonstrate that IsoAdam\nobtains a speedup over Adam when training a small Transformer.\n","authors":["Jacob Jackson"],"pdf_url":"https://arxiv.org/pdf/2307.12979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12975v1","updated":"2023-07-24T17:50:24Z","published":"2023-07-24T17:50:24Z","title":"Provable Benefits of Policy Learning from Human Preferences in\n Contextual Bandit Problems","summary":" A crucial task in decision-making problems is reward engineering. It is\ncommon in practice that no obvious choice of reward function exists. Thus, a\npopular approach is to introduce human feedback during training and leverage\nsuch feedback to learn a reward function. Among all policy learning methods\nthat use human feedback, preference-based methods have demonstrated substantial\nsuccess in recent empirical applications such as InstructGPT. In this work, we\ndevelop a theory that provably shows the benefits of preference-based methods\nin offline contextual bandits. In particular, we improve the modeling and\nsuboptimality analysis for running policy learning methods on human-scored\nsamples directly. Then, we compare it with the suboptimality guarantees of\npreference-based methods and show that preference-based methods enjoy lower\nsuboptimality.\n","authors":["Xiang Ji","Huazheng Wang","Minshuo Chen","Tuo Zhao","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12971v1","updated":"2023-07-24T17:49:05Z","published":"2023-07-24T17:49:05Z","title":"Big Data - Supply Chain Management Framework for Forecasting: Data\n Preprocessing and Machine Learning Techniques","summary":" This article intends to systematically identify and comparatively analyze\nstate-of-the-art supply chain (SC) forecasting strategies and technologies. A\nnovel framework has been proposed incorporating Big Data Analytics in SC\nManagement (problem identification, data sources, exploratory data analysis,\nmachine-learning model training, hyperparameter tuning, performance evaluation,\nand optimization), forecasting effects on human-workforce, inventory, and\noverall SC. Initially, the need to collect data according to SC strategy and\nhow to collect them has been discussed. The article discusses the need for\ndifferent types of forecasting according to the period or SC objective. The SC\nKPIs and the error-measurement systems have been recommended to optimize the\ntop-performing model. The adverse effects of phantom inventory on forecasting\nand the dependence of managerial decisions on the SC KPIs for determining model\nperformance parameters and improving operations management, transparency, and\nplanning efficiency have been illustrated. The cyclic connection within the\nframework introduces preprocessing optimization based on the post-process KPIs,\noptimizing the overall control process (inventory management, workforce\ndetermination, cost, production and capacity planning). 
The contribution of\nthis research lies in the standard SC process framework proposal, recommended\nforecasting data analysis, forecasting effects on SC performance, machine\nlearning algorithms optimization followed, and in shedding light on future\nresearch.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","Jungpil Shin","Istiyaque Ahmed Ridoy","Yoichi Tomioka","M. F. Mridha"],"pdf_url":"https://arxiv.org/pdf/2307.12971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12968v1","updated":"2023-07-24T17:46:32Z","published":"2023-07-24T17:46:32Z","title":"A Connection between One-Step Regularization and Critic Regularization\n in Reinforcement Learning","summary":" As with any machine learning problem with limited data, effective offline RL\nalgorithms require careful regularization to avoid overfitting. One-step\nmethods perform regularization by doing just a single step of policy\nimprovement, while critic regularization methods do many steps of policy\nimprovement with a regularized objective. These methods appear distinct.\nOne-step methods, such as advantage-weighted regression and conditional\nbehavioral cloning, truncate policy iteration after just one step. This ``early\nstopping'' makes one-step RL simple and stable, but can limit its asymptotic\nperformance. Critic regularization typically requires more compute but has\nappealing lower-bound guarantees. In this paper, we draw a close connection\nbetween these methods: applying a multi-step critic regularization method with\na regularization coefficient of 1 yields the same policy as one-step RL. While\npractical implementations violate our assumptions and critic regularization is\ntypically applied with smaller regularization coefficients, our experiments\nnevertheless show that our analysis makes accurate, testable predictions about\npractical offline RL methods (CQL and one-step RL) with commonly-used\nhyperparameters. Our results that every problem can be solved with a single\nstep of policy improvement, but rather that one-step RL might be competitive\nwith critic regularization on RL problems that demand strong regularization.\n","authors":["Benjamin Eysenbach","Matthieu Geist","Sergey Levine","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2307.12968v1.pdf","comment":"Accepted to ICML 2023. Video\n (https://www.youtube.com/watch?v=1xlixIHZ0R4) and code\n (https://github.com/ben-eysenbach/ac-connection)"},{"id":"http://arxiv.org/abs/2307.12967v1","updated":"2023-07-24T17:45:40Z","published":"2023-07-24T17:45:40Z","title":"Learning Dense Correspondences between Photos and Sketches","summary":" Humans effortlessly grasp the connection between sketches and real-world\nobjects, even when these sketches are far from realistic. Moreover, human\nsketch understanding goes beyond categorization -- critically, it also entails\nunderstanding how individual elements within a sketch correspond to parts of\nthe physical world it represents. What are the computational ingredients needed\nto support this ability? Towards answering this question, we make two\ncontributions: first, we introduce a new sketch-photo correspondence benchmark,\n$\\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across\n125 object categories, augmenting the existing Sketchy dataset with\nfine-grained correspondence metadata. Second, we propose a self-supervised\nmethod for learning dense correspondences between sketch-photo pairs, building\nupon recent advances in correspondence learning for pairs of photos. 
Our model\nuses a spatial transformer network to estimate the warp flow between latent\nrepresentations of a sketch and photo extracted by a contrastive learning-based\nConvNet backbone. We found that this approach outperformed several strong\nbaselines and produced predictions that were quantitatively consistent with\nother warp-based methods. However, our benchmark also revealed systematic\ndifferences between predictions of the suite of models we tested and those of\nhumans. Taken together, our work suggests a promising path towards developing\nartificial systems that achieve more human-like understanding of visual images\nat different levels of abstraction. Project page:\nhttps://photo-sketch-correspondence.github.io\n","authors":["Xuanchen Lu","Xiaolong Wang","Judith E Fan"],"pdf_url":"https://arxiv.org/pdf/2307.12967v1.pdf","comment":"Accepted to ICML 2023. Project page:\n https://photo-sketch-correspondence.github.io"},{"id":"http://arxiv.org/abs/2303.04245v2","updated":"2023-07-24T17:29:04Z","published":"2023-03-07T21:42:17Z","title":"How Do Transformers Learn Topic Structure: Towards a Mechanistic\n Understanding","summary":" While the successes of transformers across many domains are indisputable,\naccurate understanding of the learning mechanics is still largely lacking.\nTheir capabilities have been probed on benchmarks which include a variety of\nstructured and reasoning tasks -- but mathematical understanding is lagging\nsubstantially behind. Recent lines of work have begun studying representational\naspects of this question: that is, the size/depth/complexity of attention-based\nnetworks to perform certain tasks. However, there is no guarantee the learning\ndynamics will converge to the constructions proposed. In our paper, we provide\nfine-grained mechanistic understanding of how transformers learn \"semantic\nstructure\", understood as capturing co-occurrence structure of words.\nPrecisely, we show, through a combination of mathematical analysis and\nexperiments on Wikipedia data and synthetic data modeled by Latent Dirichlet\nAllocation (LDA), that the embedding layer and the self-attention layer encode\nthe topical structure. In the former case, this manifests as higher average\ninner product of embeddings between same-topic words. In the latter, it\nmanifests as higher average pairwise attention between same-topic words. The\nmathematical results involve several assumptions to make the analysis\ntractable, which we verify on data, and might be of independent interest as\nwell.\n","authors":["Yuchen Li","Yuanzhi Li","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2303.04245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12943v1","updated":"2023-07-24T17:15:38Z","published":"2023-07-24T17:15:38Z","title":"Efficiently Sampling the PSD Cone with the Metric Dikin Walk","summary":" Semi-definite programs represent a frontier of efficient computation. While\nthere has been much progress on semi-definite optimization, with moderate-sized\ninstances currently solvable in practice by the interior-point method, the\nbasic problem of sampling semi-definite solutions remains a formidable\nchallenge. The direct application of known polynomial-time algorithms for\nsampling general convex bodies to semi-definite sampling leads to a\nprohibitively high running time. In addition, known general methods require an\nexpensive rounding phase as pre-processing. 
Here we analyze the Dikin walk, by\nfirst adapting it to general metrics, then devising suitable metrics for the\nPSD cone with affine constraints. The resulting mixing time and per-step\ncomplexity are considerably smaller, and by an appropriate choice of the\nmetric, the dependence on the number of constraints can be made\npolylogarithmic. We introduce a refined notion of self-concordant matrix\nfunctions and give rules for combining different metrics. Along the way, we\nfurther develop the theory of interior-point methods for sampling.\n","authors":["Yunbum Kook","Santosh S. Vempala"],"pdf_url":"https://arxiv.org/pdf/2307.12943v1.pdf","comment":"54 pages"},{"id":"http://arxiv.org/abs/2307.12941v1","updated":"2023-07-24T17:11:39Z","published":"2023-07-24T17:11:39Z","title":"On Privileged and Convergent Bases in Neural Network Representations","summary":" In this study, we investigate whether the representations learned by neural\nnetworks possess a privileged and convergent basis. Specifically, we examine\nthe significance of feature directions represented by individual neurons.\nFirst, we establish that arbitrary rotations of neural representations cannot\nbe inverted (unlike linear networks), indicating that they do not exhibit\ncomplete rotational invariance. Subsequently, we explore the possibility of\nmultiple bases achieving identical performance. To do this, we compare the\nbases of networks trained with the same parameters but with varying random\ninitializations. Our study reveals two findings: (1) Even in wide networks such\nas WideResNets, neural networks do not converge to a unique basis; (2) Basis\ncorrelation increases significantly when a few early layers of the network are\nfrozen identically.\n Furthermore, we analyze Linear Mode Connectivity, which has been studied as a\nmeasure of basis correlation. Our findings give evidence that while Linear Mode\nConnectivity improves with increased network width, this improvement is not due\nto an increase in basis correlation.\n","authors":["Davis Brown","Nikhil Vyas","Yamini Bansal"],"pdf_url":"https://arxiv.org/pdf/2307.12941v1.pdf","comment":"In the Workshop on High-dimensional Learning Dynamics at ICML 2023"},{"id":"http://arxiv.org/abs/2307.08572v3","updated":"2023-07-24T17:01:50Z","published":"2023-07-17T15:38:11Z","title":"Revisiting the Robustness of the Minimum Error Entropy Criterion: A\n Transfer Learning Case Study","summary":" Coping with distributional shifts is an important part of transfer learning\nmethods in order to perform well in real-life tasks. However, most of the\nexisting approaches in this area either focus on an ideal scenario in which the\ndata does not contain noises or employ a complicated training paradigm or model\ndesign to deal with distributional shifts. In this paper, we revisit the\nrobustness of the minimum error entropy (MEE) criterion, a widely used\nobjective in statistical signal processing to deal with non-Gaussian noises,\nand investigate its feasibility and usefulness in real-life transfer learning\nregression tasks, where distributional shifts are common. Specifically, we put\nforward a new theoretical result showing the robustness of MEE against\ncovariate shift. We also show that by simply replacing the mean squared error\n(MSE) loss with the MEE on basic transfer learning algorithms such as\nfine-tuning and linear probing, we can achieve competitive performance with\nrespect to state-of-the-art transfer learning algorithms. 
We justify our\narguments on both synthetic data and 5 real-world time-series data.\n","authors":["Luis Pedro Silvestrin","Shujian Yu","Mark Hoogendoorn"],"pdf_url":"https://arxiv.org/pdf/2307.08572v3.pdf","comment":"Manuscript accepted at ECAI-23. Code available at\n https://github.com/lpsilvestrin/mee-finetune"},{"id":"http://arxiv.org/abs/2307.12926v1","updated":"2023-07-24T16:36:04Z","published":"2023-07-24T16:36:04Z","title":"Contextual Bandits and Imitation Learning via Preference-Based Active\n Queries","summary":" We consider the problem of contextual bandits and imitation learning, where\nthe learner lacks direct knowledge of the executed action's reward. Instead,\nthe learner can actively query an expert at each round to compare two actions\nand receive noisy preference feedback. The learner's objective is two-fold: to\nminimize the regret associated with the executed actions, while simultaneously,\nminimizing the number of comparison queries made to the expert. In this paper,\nwe assume that the learner has access to a function class that can represent\nthe expert's preference model under appropriate link functions, and provide an\nalgorithm that leverages an online regression oracle with respect to this\nfunction class for choosing its actions and deciding when to query. For the\ncontextual bandit setting, our algorithm achieves a regret bound that combines\nthe best of both worlds, scaling as $O(\\min\\{\\sqrt{T}, d/\\Delta\\})$, where $T$\nrepresents the number of interactions, $d$ represents the eluder dimension of\nthe function class, and $\\Delta$ represents the minimum preference of the\noptimal action over any suboptimal action under all contexts. Our algorithm\ndoes not require the knowledge of $\\Delta$, and the obtained regret bound is\ncomparable to what can be achieved in the standard contextual bandits setting\nwhere the learner observes reward signals at each round. Additionally, our\nalgorithm makes only $O(\\min\\{T, d^2/\\Delta^2\\})$ queries to the expert. We\nthen extend our algorithm to the imitation learning setting, where the learning\nagent engages with an unknown environment in episodes of length $H$ each, and\nprovide similar guarantees for regret and query complexity. Interestingly, our\nalgorithm for imitation learning can even learn to outperform the underlying\nexpert, when it is suboptimal, highlighting a practical benefit of\npreference-based feedback in imitation learning.\n","authors":["Ayush Sekhari","Karthik Sridharan","Wen Sun","Runzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2307.12926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12231v2","updated":"2023-07-24T16:00:37Z","published":"2023-04-24T16:18:22Z","title":"An Approximation Theory for Metric Space-Valued Functions With A View\n Towards Deep Learning","summary":" Motivated by the developing mathematics of deep learning, we build universal\nfunctions approximators of continuous maps between arbitrary Polish metric\nspaces $\\mathcal{X}$ and $\\mathcal{Y}$ using elementary functions between\nEuclidean spaces as building blocks. Earlier results assume that the target\nspace $\\mathcal{Y}$ is a topological vector space. We overcome this limitation\nby ``randomization'': our approximators output discrete probability measures\nover $\\mathcal{Y}$. 
When $\\mathcal{X}$ and $\\mathcal{Y}$ are Polish without\nadditional structure, we prove very general qualitative guarantees; when they\nhave suitable combinatorial structure, we prove quantitative guarantees for\nH\\\"{o}lder-like maps, including maps between finite graphs, solution operators\nto rough differential equations between certain Carnot groups, and continuous\nnon-linear operators between Banach spaces arising in inverse problems. In\nparticular, we show that the required number of Dirac measures is determined by\nthe combinatorial structure of $\\mathcal{X}$ and $\\mathcal{Y}$. For barycentric\n$\\mathcal{Y}$, including Banach spaces, $\\mathbb{R}$-trees, Hadamard manifolds,\nor Wasserstein spaces on Polish metric spaces, our approximators reduce to\n$\\mathcal{Y}$-valued functions. When the Euclidean approximators are neural\nnetworks, our constructions generalize transformer networks, providing a new\nprobabilistic viewpoint of geometric deep learning.\n","authors":["Anastasis Kratsios","Chong Liu","Matti Lassas","Maarten V. de Hoop","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2304.12231v2.pdf","comment":"14 Figures, 3 Tables, 78 Pages (Main 40, Proofs 26, Acknowledgments\n and References 12)"},{"id":"http://arxiv.org/abs/2307.12906v1","updated":"2023-07-24T15:59:36Z","published":"2023-07-24T15:59:36Z","title":"QAmplifyNet: Pushing the Boundaries of Supply Chain Backorder Prediction\n Using Interpretable Hybrid Quantum - Classical Neural Network","summary":" Supply chain management relies on accurate backorder prediction for\noptimizing inventory control, reducing costs, and enhancing customer\nsatisfaction. However, traditional machine-learning models struggle with\nlarge-scale datasets and complex relationships, hindering real-world data\ncollection. This research introduces a novel methodological framework for\nsupply chain backorder prediction, addressing the challenge of handling large\ndatasets. Our proposed model, QAmplifyNet, employs quantum-inspired techniques\nwithin a quantum-classical neural network to predict backorders effectively on\nshort and imbalanced datasets. Experimental evaluations on a benchmark dataset\ndemonstrate QAmplifyNet's superiority over classical models, quantum ensembles,\nquantum neural networks, and deep reinforcement learning. Its proficiency in\nhandling short, imbalanced datasets makes it an ideal solution for supply chain\nmanagement. To enhance model interpretability, we use Explainable Artificial\nIntelligence techniques. Practical implications include improved inventory\ncontrol, reduced backorders, and enhanced operational efficiency. QAmplifyNet\nseamlessly integrates into real-world supply chain management systems, enabling\nproactive decision-making and efficient resource allocation. Future work\ninvolves exploring additional quantum-inspired techniques, expanding the\ndataset, and investigating other supply chain applications. This research\nunlocks the potential of quantum computing in supply chain optimization and\npaves the way for further exploration of quantum-inspired machine learning\nmodels in supply chain management. Our framework and QAmplifyNet model offer a\nbreakthrough approach to supply chain backorder prediction, providing superior\nperformance and opening new avenues for leveraging quantum-inspired techniques\nin supply chain management.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","Md. Saiful Islam","Jungpil Shin","M. F. 
Mridha","Yuichi Okuyama"],"pdf_url":"https://arxiv.org/pdf/2307.12906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12904v1","updated":"2023-07-24T15:52:33Z","published":"2023-07-24T15:52:33Z","title":"Universal Approximation Theorem and error bounds for quantum neural\n networks and quantum reservoirs","summary":" Universal approximation theorems are the foundations of classical neural\nnetworks, providing theoretical guarantees that the latter are able to\napproximate maps of interest. Recent results have shown that this can also be\nachieved in a quantum setting, whereby classical functions can be approximated\nby parameterised quantum circuits. We provide here precise error bounds for\nspecific classes of functions and extend these results to the interesting new\nsetup of randomised quantum circuits, mimicking classical reservoir neural\nnetworks. Our results show in particular that a quantum neural network with\n$\\mathcal{O}(\\varepsilon^{-2})$ weights and $\\mathcal{O} (\\lceil\n\\log_2(\\varepsilon^{-1}) \\rceil)$ qubits suffices to achieve accuracy\n$\\varepsilon>0$ when approximating functions with integrable Fourier transform.\n","authors":["Lukas Gonon","Antoine Jacquier"],"pdf_url":"https://arxiv.org/pdf/2307.12904v1.pdf","comment":"20 pages, 0 figure"},{"id":"http://arxiv.org/abs/2206.02909v2","updated":"2023-07-24T15:47:59Z","published":"2022-06-06T21:14:01Z","title":"Self-supervised Learning for Human Activity Recognition Using 700,000\n Person-days of Wearable Data","summary":" Advances in deep learning for human activity recognition have been relatively\nlimited due to the lack of large labelled datasets. In this study, we leverage\nself-supervised learning techniques on the UK-Biobank activity tracker\ndataset--the largest of its kind to date--containing more than 700,000\nperson-days of unlabelled wearable sensor data. Our resulting activity\nrecognition model consistently outperformed strong baselines across seven\nbenchmark datasets, with an F1 relative improvement of 2.5%-100% (median\n18.4%), the largest improvements occurring in the smaller datasets. In contrast\nto previous studies, our results generalise across external datasets, devices,\nand environments. Our open-source model will help researchers and developers to\nbuild customisable and generalisable activity classifiers with high\nperformance.\n","authors":["Hang Yuan","Shing Chan","Andrew P. Creagh","Catherine Tong","David A. Clifton","Aiden Doherty"],"pdf_url":"https://arxiv.org/pdf/2206.02909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12897v1","updated":"2023-07-24T15:44:30Z","published":"2023-07-24T15:44:30Z","title":"Anytime Model Selection in Linear Bandits","summary":" Model selection in the context of bandit optimization is a challenging\nproblem, as it requires balancing exploration and exploitation not only for\naction selection, but also for model selection. One natural approach is to rely\non online learning algorithms that treat different models as experts. Existing\nmethods, however, scale poorly ($\\text{poly}M$) with the number of models $M$\nin terms of their regret. Our key insight is that, for model selection in\nlinear bandits, we can emulate full-information feedback to the online learner\nwith a favorable bias-variance trade-off. This allows us to develop ALEXP,\nwhich has an exponentially improved ($\\log M$) dependence on $M$ for its\nregret. 
ALEXP has anytime guarantees on its regret, and neither requires\nknowledge of the horizon $n$, nor relies on an initial purely exploratory\nstage. Our approach utilizes a novel time-uniform analysis of the Lasso,\nestablishing a new connection between online learning and high-dimensional\nstatistics.\n","authors":["Parnian Kassraie","Aldo Pacchiano","Nicolas Emmenegger","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2307.12897v1.pdf","comment":"37 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.12892v1","updated":"2023-07-24T15:42:33Z","published":"2023-07-24T15:42:33Z","title":"A Statistical View of Column Subset Selection","summary":" We consider the problem of selecting a small subset of representative\nvariables from a large dataset. In the computer science literature, this\ndimensionality reduction problem is typically formalized as Column Subset\nSelection (CSS). Meanwhile, the typical statistical formalization is to find an\ninformation-maximizing set of Principal Variables. This paper shows that these\ntwo approaches are equivalent, and moreover, both can be viewed as maximum\nlikelihood estimation within a certain semi-parametric model. Using these\nconnections, we show how to efficiently (1) perform CSS using only summary\nstatistics from the original dataset; (2) perform CSS in the presence of\nmissing and/or censored data; and (3) select the subset size for CSS in a\nhypothesis testing framework.\n","authors":["Anav Sood","Trevor Hastie"],"pdf_url":"https://arxiv.org/pdf/2307.12892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08649v3","updated":"2023-07-24T15:33:25Z","published":"2023-04-17T22:53:54Z","title":"Classification of US Supreme Court Cases using BERT-Based Techniques","summary":" Models based on bidirectional encoder representations from transformers\n(BERT) produce state of the art (SOTA) results on many natural language\nprocessing (NLP) tasks such as named entity recognition (NER), part-of-speech\n(POS) tagging etc. An interesting phenomenon occurs when classifying long\ndocuments such as those from the US supreme court where BERT-based models can\nbe considered difficult to use on a first-pass or out-of-the-box basis. In this\npaper, we experiment with several BERT-based classification techniques for US\nsupreme court decisions or supreme court database (SCDB) and compare them with\nthe previous SOTA results. We then compare our results specifically with SOTA\nmodels for long documents. We compare our results for two classification tasks:\n(1) a broad classification task with 15 categories and (2) a fine-grained\nclassification task with 279 categories. Our best result produces an accuracy\nof 80\\% on the 15 broad categories and 60\\% on the fine-grained 279 categories\nwhich marks an improvement of 8\\% and 28\\% respectively from previously\nreported SOTA results.\n","authors":["Shubham Vatsal","Adam Meyers","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2304.08649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.13628v2","updated":"2023-07-24T15:31:05Z","published":"2021-08-31T05:38:36Z","title":"Learning Optimal Prescriptive Trees from Observational Data","summary":" We consider the problem of learning an optimal prescriptive tree (i.e., an\ninterpretable treatment assignment policy in the form of a binary tree) of\nmoderate depth, from observational data. 
This problem arises in numerous\nsocially important domains such as public health and personalized medicine,\nwhere interpretable and data-driven interventions are sought based on data\ngathered in deployment -- through passive collection of data -- rather than\nfrom randomized trials. We propose a method for learning optimal prescriptive\ntrees using mixed-integer optimization (MIO) technology. We show that under\nmild conditions our method is asymptotically exact in the sense that it\nconverges to an optimal out-of-sample treatment assignment policy as the number\nof historical data samples tends to infinity. Contrary to existing literature,\nour approach: 1) does not require data to be randomized, 2) does not impose\nstringent assumptions on the learned trees, and 3) has the ability to model\ndomain specific constraints. Through extensive computational experiments, we\ndemonstrate that our asymptotic guarantees translate to significant performance\nimprovements in finite samples, as well as showcase our uniquely flexible\nmodeling power by incorporating budget and fairness constraints.\n","authors":["Nathanael Jo","Sina Aghaei","Andrés Gómez","Phebe Vayanos"],"pdf_url":"https://arxiv.org/pdf/2108.13628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.11389v3","updated":"2023-07-24T15:28:34Z","published":"2022-08-24T09:26:12Z","title":"Approximate blocked Gibbs sampling for Bayesian neural networks","summary":" In this work, minibatch MCMC sampling for feedforward neural networks is made\nmore feasible. To this end, it is proposed to sample subgroups of parameters\nvia a blocked Gibbs sampling scheme. By partitioning the parameter space,\nsampling is possible irrespective of layer width. It is also possible to\nalleviate vanishing acceptance rates for increasing depth by reducing the\nproposal variance in deeper layers. Increasing the length of a non-convergent\nchain increases the predictive accuracy in classification tasks, so avoiding\nvanishing acceptance rates and consequently enabling longer chain runs have\npractical benefits. Moreover, non-convergent chain realizations aid in the\nquantification of predictive uncertainty. An open problem is how to perform\nminibatch MCMC sampling for feedforward neural networks in the presence of\naugmented data.\n","authors":["Theodore Papamarkou"],"pdf_url":"https://arxiv.org/pdf/2208.11389v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12803v3","updated":"2023-07-24T15:27:16Z","published":"2022-01-30T12:53:51Z","title":"Generalizing similarity in noisy setups: the DIBS phenomenon","summary":" This work uncovers an interplay among data density, noise, and the\ngeneralization ability in similarity learning. We consider Siamese Neural\nNetworks (SNNs), which are the basic form of contrastive learning, and explore\ntwo types of noise that can impact SNNs, Pair Label Noise (PLN) and Single\nLabel Noise (SLN). Our investigation reveals that SNNs exhibit double descent\nbehaviour regardless of the training setup and that it is further exacerbated\nby noise. We demonstrate that the density of data pairs is crucial for\ngeneralization. When SNNs are trained on sparse datasets with the same amount\nof PLN or SLN, they exhibit comparable generalization properties. However, when\nusing dense datasets, PLN cases generalize worse than SLN ones in the\noverparametrized region, leading to a phenomenon we call Density-Induced Break\nof Similarity (DIBS). 
In this regime, PLN similarity violation becomes\nmacroscopical, corrupting the dataset to the point where complete interpolation\ncannot be achieved, regardless of the number of model parameters. Our analysis\nalso delves into the correspondence between online optimization and offline\ngeneralization in similarity learning. The results show that this equivalence\nfails in the presence of label noise in all the scenarios considered.\n","authors":["Nayara Fonseca","Veronica Guidetti"],"pdf_url":"https://arxiv.org/pdf/2201.12803v3.pdf","comment":"v3: version accepted at ECAI 2023 + Supplementary Material"},{"id":"http://arxiv.org/abs/2307.10490v3","updated":"2023-07-24T15:24:17Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10891v2","updated":"2023-07-24T15:16:46Z","published":"2023-06-19T12:36:54Z","title":"Transformer Training Strategies for Forecasting Multiple Load Time\n Series","summary":" In the smart grid of the future, accurate load forecasts on the level of\nindividual clients can help to balance supply and demand locally and to prevent\ngrid outages. While the number of monitored clients will increase with the\nongoing smart meter rollout, the amount of data per client will always be\nlimited. We evaluate whether a Transformer load forecasting model benefits from\na transfer learning strategy, where a global univariate model is trained on the\nload time series from multiple clients. In experiments with two datasets\ncontaining load time series from several hundred clients, we find that the\nglobal training strategy is superior to the multivariate and local training\nstrategies used in related work. On average, the global training strategy\nresults in 21.8% and 12.8% lower forecasting errors than the two other\nstrategies, measured across forecasting horizons from one day to one month into\nthe future. A comparison to linear models, multi-layer perceptrons and LSTMs\nshows that Transformers are effective for load forecasting when they are\ntrained with the global training strategy.\n","authors":["Matthias Hertel","Maximilian Beichter","Benedikt Heidrich","Oliver Neumann","Benjamin Schäfer","Ralf Mikut","Veit Hagenmeyer"],"pdf_url":"https://arxiv.org/pdf/2306.10891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12872v1","updated":"2023-07-24T15:10:22Z","published":"2023-07-24T15:10:22Z","title":"Data-free Black-box Attack based on Diffusion Model","summary":" Since the training data for the target model in a data-free black-box attack\nis not available, most recent schemes utilize GANs to generate data for\ntraining substitute model. 
However, these GANs-based schemes suffer from low\ntraining efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a data-free black-box attack scheme based\non diffusion model to improve the efficiency and accuracy of substitute\ntraining. Despite the data generated by the diffusion model exhibits high\nquality, it presents diverse domain distributions and contains many samples\nthat do not meet the discriminative criteria of the target model. To further\nfacilitate the diffusion model to generate data suitable for the target model,\nwe propose a Latent Code Augmentation (LCA) method to guide the diffusion model\nin generating data. With the guidance of LCA, the data generated by the\ndiffusion model not only meets the discriminative criteria of the target model\nbut also exhibits high diversity. By utilizing this data, it is possible to\ntrain substitute model that closely resemble the target model more efficiently.\nExtensive experiments demonstrate that our LCA achieves higher attack success\nrates and requires fewer query budgets compared to GANs-based schemes for\ndifferent target models.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12862v1","updated":"2023-07-24T15:02:03Z","published":"2023-07-24T15:02:03Z","title":"Stochastic Step-wise Feature Selection for Exponential Random Graph\n Models (ERGMs)","summary":" Statistical analysis of social networks provides valuable insights into\ncomplex network interactions across various scientific disciplines. However,\naccurate modeling of networks remains challenging due to the heavy\ncomputational burden and the need to account for observed network dependencies.\nExponential Random Graph Models (ERGMs) have emerged as a promising technique\nused in social network modeling to capture network dependencies by\nincorporating endogenous variables. Nevertheless, using ERGMs poses multiple\nchallenges, including the occurrence of ERGM degeneracy, which generates\nunrealistic and meaningless network structures. To address these challenges and\nenhance the modeling of collaboration networks, we propose and test a novel\napproach that focuses on endogenous variable selection within ERGMs. Our method\naims to overcome the computational burden and improve the accommodation of\nobserved network dependencies, thereby facilitating more accurate and\nmeaningful interpretations of network phenomena in various scientific fields.\nWe conduct empirical testing and rigorous analysis to contribute to the\nadvancement of statistical techniques and offer practical insights for network\nanalysis.\n","authors":["Helal El-Zaatari","Fei Yu","Michael R Kosorok"],"pdf_url":"https://arxiv.org/pdf/2307.12862v1.pdf","comment":"23 pages, 6 tables and 18 figures"},{"id":"http://arxiv.org/abs/2307.12856v1","updated":"2023-07-24T14:56:30Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web navigation. 
However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that can complete the tasks on real\nwebsites following natural language instructions. WebAgent plans ahead by\ndecomposing instructions into canonical sub-instructions, summarizes long HTML\ndocuments into task-relevant snippets, and acts on websites via generated\nPython programs from those. We design WebAgent with Flan-U-PaLM, for grounded\ncode generation, and HTML-T5, new pre-trained LLMs for long HTML documents\nusing local and global attention mechanisms and a mixture of long-span\ndenoising objectives, for planning and summarization. We empirically\ndemonstrate that our recipe improves the success on a real website by over 50%,\nand that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9%\nhigher success rate than prior SoTA on the MiniWoB web navigation benchmark and\nbetter accuracy on offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12851v1","updated":"2023-07-24T14:51:54Z","published":"2023-07-24T14:51:54Z","title":"Early Neuron Alignment in Two-layer ReLU Networks with Small\n Initialization","summary":" This paper studies the problem of training a two-layer ReLU network for\nbinary classification using gradient flow with small initialization. We\nconsider a training dataset with well-separated input vectors: Any pair of\ninput data with the same label are positively correlated, and any pair with\ndifferent labels are negatively correlated. Our analysis shows that, during the\nearly phase of training, neurons in the first layer try to align with either\nthe positive data or the negative data, depending on its corresponding weight\non the second layer. A careful analysis of the neurons' directional dynamics\nallows us to provide an $\\mathcal{O}(\\frac{\\log n}{\\sqrt{\\mu}})$ upper bound on\nthe time it takes for all neurons to achieve good alignment with the input\ndata, where $n$ is the number of data points and $\\mu$ measures how well the\ndata are separated. After the early alignment phase, the loss converges to zero\nat a $\\mathcal{O}(\\frac{1}{t})$ rate, and the weight matrix on the first layer\nis approximately low-rank. Numerical experiments on the MNIST dataset\nillustrate our theoretical findings.\n","authors":["Hancheng Min","René Vidal","Enrique Mallada"],"pdf_url":"https://arxiv.org/pdf/2307.12851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12840v1","updated":"2023-07-24T14:37:22Z","published":"2023-07-24T14:37:22Z","title":"Efficiently Learning One-Hidden-Layer ReLU Networks via Schur\n Polynomials","summary":" We study the problem of PAC learning a linear combination of $k$ ReLU\nactivations under the standard Gaussian distribution on $\\mathbb{R}^d$ with\nrespect to the square loss. Our main result is an efficient algorithm for this\nlearning task with sample and computational complexity $(dk/\\epsilon)^{O(k)}$,\nwhere $\\epsilon>0$ is the target accuracy. Prior work had given an algorithm\nfor this problem with complexity $(dk/\\epsilon)^{h(k)}$, where the function\n$h(k)$ scales super-polynomially in $k$. 
Interestingly, the complexity of our\nalgorithm is near-optimal within the class of Correlational Statistical Query\nalgorithms. At a high-level, our algorithm uses tensor decomposition to\nidentify a subspace such that all the $O(k)$-order moments are small in the\northogonal directions. Its analysis makes essential use of the theory of Schur\npolynomials to show that the higher-moment error tensors are small given that\nthe lower-order ones are.\n","authors":["Ilias Diakonikolas","Daniel M. Kane"],"pdf_url":"https://arxiv.org/pdf/2307.12840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08272v3","updated":"2023-07-24T14:28:11Z","published":"2023-03-14T23:26:55Z","title":"Automated patent extraction powers generative modeling in focused\n chemical spaces","summary":" Deep generative models have emerged as an exciting avenue for inverse\nmolecular design, with progress coming from the interplay between training\nalgorithms and molecular representations. One of the key challenges in their\napplicability to materials science and chemistry has been the lack of access to\nsizeable training datasets with property labels. Published patents contain the\nfirst disclosure of new materials prior to their publication in journals, and\nare a vast source of scientific knowledge that has remained relatively untapped\nin the field of data-driven molecular design. Because patents are filed seeking\nto protect specific uses, molecules in patents can be considered to be weakly\nlabeled into application classes. Furthermore, patents published by the US\nPatent and Trademark Office (USPTO) are downloadable and have machine-readable\ntext and molecular structures. In this work, we train domain-specific\ngenerative models using patent data sources by developing an automated pipeline\nto go from USPTO patent digital files to the generation of novel candidates\nwith minimal human intervention. We test the approach on two in-class extracted\ndatasets, one in organic electronics and another in tyrosine kinase inhibitors.\nWe then evaluate the ability of generative models trained on these in-class\ndatasets on two categories of tasks (distribution learning and property\noptimization), identify strengths and limitations, and suggest possible\nexplanations and remedies that could be used to overcome these in practice.\n","authors":["Akshay Subramanian","Kevin P. Greenman","Alexis Gervaix","Tzuhsiung Yang","Rafael Gómez-Bombarelli"],"pdf_url":"https://arxiv.org/pdf/2303.08272v3.pdf","comment":"Digital Discovery (2023)"},{"id":"http://arxiv.org/abs/2307.02620v2","updated":"2023-07-24T14:21:09Z","published":"2023-07-05T19:48:03Z","title":"Learning when to observe: A frugal reinforcement learning framework for\n a high-cost world","summary":" Reinforcement learning (RL) has been shown to learn sophisticated control\npolicies for complex tasks including games, robotics, heating and cooling\nsystems and text generation. The action-perception cycle in RL, however,\ngenerally assumes that a measurement of the state of the environment is\navailable at each time step without a cost. In applications such as materials\ndesign, deep-sea and planetary robot exploration and medicine, however, there\ncan be a high cost associated with measuring, or even approximating, the state\nof the environment. In this paper, we survey the recently growing literature\nthat adopts the perspective that an RL agent might not need, or even want, a\ncostly measurement at each time step. 
Within this context, we propose the Deep\nDynamic Multi-Step Observationless Agent (DMSOA), contrast it with the\nliterature and empirically evaluate it on OpenAI gym and Atari Pong\nenvironments. Our results show that DMSOA learns a better policy with fewer\ndecision steps and measurements than the considered alternative from the\nliterature. The corresponding code is available at:\n\\url{https://github.com/cbellinger27/Learning-when-to-observe-in-RL}\n","authors":["Colin Bellinger","Mark Crowley","Isaac Tamblyn"],"pdf_url":"https://arxiv.org/pdf/2307.02620v2.pdf","comment":"Accepted for presentation at ECML-PKDD 2023 workshop track:\n Simplification, Compression, Efficiency and Frugality for Artificial\n Intelligence (SCEFA)"},{"id":"http://arxiv.org/abs/2307.12822v1","updated":"2023-07-24T14:19:36Z","published":"2023-07-24T14:19:36Z","title":"Learning Provably Robust Estimators for Inverse Problems via Jittering","summary":" Deep neural networks provide excellent performance for inverse problems such\nas denoising. However, neural networks can be sensitive to adversarial or\nworst-case perturbations. This raises the question of whether such networks can\nbe trained efficiently to be worst-case robust. In this paper, we investigate\nwhether jittering, a simple regularization technique that adds isotropic\nGaussian noise during training, is effective for learning worst-case robust\nestimators for inverse problems. While well studied for prediction in\nclassification tasks, the effectiveness of jittering for inverse problems has\nnot been systematically investigated. In this paper, we present a novel\nanalytical characterization of the optimal $\\ell_2$-worst-case robust estimator\nfor linear denoising and show that jittering yields optimal robust denoisers.\nFurthermore, we examine jittering empirically via training deep neural networks\n(U-nets) for natural image denoising, deconvolution, and accelerated magnetic\nresonance imaging (MRI). The results show that jittering significantly enhances\nthe worst-case robustness, but can be suboptimal for inverse problems beyond\ndenoising. Moreover, our results imply that training on real data, which often\ncontains slight noise, is somewhat robustness-enhancing.\n","authors":["Anselm Krainovic","Mahdi Soltanolkotabi","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2307.12822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02813v2","updated":"2023-07-24T14:17:24Z","published":"2023-07-06T07:18:22Z","title":"CPDG: A Contrastive Pre-Training Method for Dynamic Graph Neural\n Networks","summary":" Dynamic graph data mining has gained popularity in recent years due to the\nrich information contained in dynamic graphs and their widespread use in the\nreal world. Despite the advances in dynamic graph neural networks (DGNNs), the\nrich information and diverse downstream tasks have posed significant\ndifficulties for the practical application of DGNNs in industrial scenarios. To\nthis end, in this paper, we propose to address them by pre-training and present\nthe Contrastive Pre-Training Method for Dynamic Graph Neural Networks (CPDG).\nCPDG tackles the challenges of pre-training for DGNNs, including generalization\ncapability and long-short term modeling capability, through a flexible\nstructural-temporal subgraph sampler along with structural-temporal contrastive\npre-training schemes. 
Extensive experiments conducted on both large-scale\nresearch and industrial dynamic graph datasets show that CPDG outperforms\nexisting methods in dynamic graph pre-training for various downstream tasks\nunder three transfer settings.\n","authors":["Yuanchen Bei","Hao Xu","Sheng Zhou","Huixuan Chi","Haishuai Wang","Mengdi Zhang","Zhao Li","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2307.02813v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12797v1","updated":"2023-07-24T13:46:50Z","published":"2023-07-24T13:46:50Z","title":"Causal Fair Machine Learning via Rank-Preserving Interventional\n Distributions","summary":" A decision can be defined as fair if equal individuals are treated equally\nand unequals unequally. Adopting this definition, the task of designing machine\nlearning models that mitigate unfairness in automated decision-making systems\nmust include causal thinking when introducing protected attributes. Following a\nrecent proposal, we define individuals as being normatively equal if they are\nequal in a fictitious, normatively desired (FiND) world, where the protected\nattribute has no (direct or indirect) causal effect on the target. We propose\nrank-preserving interventional distributions to define an estimand of this FiND\nworld and a warping method for estimation. Evaluation criteria for both the\nmethod and resulting model are presented and validated through simulations and\nempirical data. With this, we show that our warping approach effectively\nidentifies the most discriminated individuals and mitigates unfairness.\n","authors":["Ludwig Bothmann","Susanne Dandl","Michael Schomaker"],"pdf_url":"https://arxiv.org/pdf/2307.12797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.05018v3","updated":"2023-07-24T13:46:46Z","published":"2022-07-11T17:13:10Z","title":"Learning Temporally Extended Skills in Continuous Domains as Symbolic\n Actions for Planning","summary":" Problems which require both long-horizon planning and continuous control\ncapabilities pose significant challenges to existing reinforcement learning\nagents. In this paper we introduce a novel hierarchical reinforcement learning\nagent which links temporally extended skills for continuous control with a\nforward model in a symbolic discrete abstraction of the environment's state for\nplanning. We term our agent SEADS for Symbolic Effect-Aware Diverse Skills. We\nformulate an objective and corresponding algorithm which leads to unsupervised\nlearning of a diverse set of skills through intrinsic motivation given a known\nstate abstraction. The skills are jointly learned with the symbolic forward\nmodel which captures the effect of skill execution in the state abstraction.\nAfter training, we can leverage the skills as symbolic actions using the\nforward model for long-horizon planning and subsequently execute the plan using\nthe learned continuous-action control skills. The proposed algorithm learns\nskills and forward models that can be used to solve complex tasks which require\nboth continuous control and long-horizon planning capabilities with high\nsuccess rate. It compares favorably with other flat and hierarchical\nreinforcement learning baseline agents and is successfully demonstrated with a\nreal robot.\n","authors":["Jan Achterhold","Markus Krimmel","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2207.05018v3.pdf","comment":"Project website (including video) is available at\n https://seads.is.tue.mpg.de/. 
(v2) Accepted for publication at the 6th\n Conference on Robot Learning (CoRL) 2022, Auckland, New Zealand. (v3) Added\n details on checkpointing (S.8.1), with references on p.7, p.8, p.21 to\n clarify number of env. steps of reported results"},{"id":"http://arxiv.org/abs/2307.12790v1","updated":"2023-07-24T13:39:21Z","published":"2023-07-24T13:39:21Z","title":"Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution\n for Medical Image Classification","summary":" Graph-based neural network models are gaining traction in the field of\nrepresentation learning due to their ability to uncover latent topological\nrelationships between entities that are otherwise challenging to identify.\nThese models have been employed across a diverse range of domains, encompassing\ndrug discovery, protein interactions, semantic segmentation, and fluid dynamics\nresearch. In this study, we investigate the potential of Graph Neural Networks\n(GNNs) for medical image classification. We introduce a novel model that\ncombines GNNs and edge convolution, leveraging the interconnectedness of RGB\nchannel feature values to strongly represent connections between crucial graph\nnodes. Our proposed model not only performs on par with state-of-the-art Deep\nNeural Networks (DNNs) but does so with 1000 times fewer parameters, resulting\nin reduced training time and data requirements. We compare our Graph\nConvolutional Neural Network (GCNN) to pre-trained DNNs for classifying\nMedMNIST dataset classes, revealing promising prospects for GNNs in medical\nimage analysis. Our results also encourage further exploration of advanced\ngraph-based models such as Graph Attention Networks (GAT) and Graph\nAuto-Encoders in the medical imaging domain. The proposed model yields more\nreliable, interpretable, and accurate outcomes for tasks like semantic\nsegmentation and image classification compared to simpler GCNNs\n","authors":["Aryan Singh","Pepijn Van de Ven","Ciarán Eising","Patrick Denny"],"pdf_url":"https://arxiv.org/pdf/2307.12790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13170v4","updated":"2023-07-24T13:35:28Z","published":"2022-04-27T20:04:24Z","title":"AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias\n Estimation","summary":" In Federated Learning (FL), a number of clients or devices collaborate to\ntrain a model without sharing their data. Models are optimized locally at each\nclient and further communicated to a central hub for aggregation. While FL is\nan appealing decentralized training paradigm, heterogeneity among data from\ndifferent clients can cause the local optimization to drift away from the\nglobal objective. In order to estimate and therefore remove this drift,\nvariance reduction techniques have been incorporated into FL optimization\nrecently. However, these approaches inaccurately estimate the clients' drift\nand ultimately fail to remove it properly. In this work, we propose an adaptive\nalgorithm that accurately estimates drift across clients. In comparison to\nprevious works, our approach necessitates less storage and communication\nbandwidth, as well as lower compute costs. Additionally, our proposed\nmethodology induces stability by constraining the norm of estimates for client\ndrift, making it more practical for large scale FL. 
Experimental findings\ndemonstrate that the proposed algorithm converges significantly faster and\nachieves higher accuracy than the baselines across various FL benchmarks.\n","authors":["Farshid Varno","Marzie Saghayi","Laya Rafiee Sevyeri","Sharut Gupta","Stan Matwin","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2204.13170v4.pdf","comment":"Published as a conference paper at ECCV 2022; Corrected some typos in\n the text and a baseline algorithm"},{"id":"http://arxiv.org/abs/2307.12788v1","updated":"2023-07-24T13:35:18Z","published":"2023-07-24T13:35:18Z","title":"Analyzing the Strategy of Propaganda using Inverse Reinforcement\n Learning: Evidence from the 2022 Russian Invasion of Ukraine","summary":" The 2022 Russian invasion of Ukraine was accompanied by a large-scale,\npro-Russian propaganda campaign on social media. However, the strategy behind\nthe dissemination of propaganda has remained unclear, particularly how the\nonline discourse was strategically shaped by the propagandists' community.\nHere, we analyze the strategy of the Twitter community using an inverse\nreinforcement learning (IRL) approach. Specifically, IRL allows us to model\nonline behavior as a Markov decision process, where the goal is to infer the\nunderlying reward structure that guides propagandists when interacting with\nusers with a supporting or opposing stance toward the invasion. Thereby, we aim\nto understand empirically whether and how between-user interactions are\nstrategically used to promote the proliferation of Russian propaganda. For\nthis, we leverage a large-scale dataset with 349,455 posts with pro-Russian\npropaganda from 132,131 users. We show that bots and humans follow a different\nstrategy: bots respond predominantly to pro-invasion messages, suggesting that\nthey seek to drive virality; while messages indicating opposition primarily\nelicit responses from humans, suggesting that they tend to engage in critical\ndiscussions. To the best of our knowledge, this is the first study analyzing\nthe strategy behind propaganda from the 2022 Russian invasion of Ukraine\nthrough the lens of IRL.\n","authors":["Dominique Geissler","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2307.12788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12540v2","updated":"2023-07-24T13:35:16Z","published":"2023-03-22T13:16:37Z","title":"Deployment of Image Analysis Algorithms under Prevalence Shifts","summary":" Domain gaps are among the most relevant roadblocks in the clinical\ntranslation of machine learning (ML)-based solutions for medical image\nanalysis. While current research focuses on new training paradigms and network\narchitectures, little attention is given to the specific effect of prevalence\nshifts on an algorithm deployed in practice. Such discrepancies between class\nfrequencies in the data used for a method's development/validation and that in\nits deployment environment(s) are of great importance, for example in the\ncontext of artificial intelligence (AI) democratization, as disease prevalences\nmay vary widely across time and location. Our contribution is twofold. First,\nwe empirically demonstrate the potentially severe consequences of missing\nprevalence handling by analyzing (i) the extent of miscalibration, (ii) the\ndeviation of the decision threshold from the optimum, and (iii) the ability of\nvalidation metrics to reflect neural network performance on the deployment\npopulation as a function of the discrepancy between development and deployment\nprevalence. 
Second, we propose a workflow for prevalence-aware image\nclassification that uses estimated deployment prevalences to adjust a trained\nclassifier to a new environment, without requiring additional annotated\ndeployment data. Comprehensive experiments based on a diverse set of 30 medical\nclassification tasks showcase the benefit of the proposed workflow in\ngenerating better classifier decisions and more reliable performance estimates\ncompared to current practice.\n","authors":["Patrick Godau","Piotr Kalinowski","Evangelia Christodoulou","Annika Reinke","Minu Tizabi","Luciana Ferrer","Paul Jäger","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.12540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12775v1","updated":"2023-07-24T13:24:56Z","published":"2023-07-24T13:24:56Z","title":"Is attention all you need in medical image analysis? A review","summary":" Medical imaging is a key component in clinical diagnosis, treatment planning\nand clinical trial design, accounting for almost 90% of all healthcare data.\nCNNs achieved performance gains in medical image analysis (MIA) over the last\nyears. CNNs can efficiently model local pixel interactions and be trained on\nsmall-scale MI data. The main disadvantage of typical CNN models is that they\nignore global pixel relationships within images, which limits their\ngeneralisation ability to understand out-of-distribution data with different\n'global' information. The recent progress of Artificial Intelligence gave rise\nto Transformers, which can learn global relationships from data. However, full\nTransformer models need to be trained on large-scale data and involve\ntremendous computational complexity. Attention and Transformer compartments\n(Transf/Attention) which can well maintain properties for modelling global\nrelationships, have been proposed as lighter alternatives of full Transformers.\nRecently, there is an increasing trend to co-pollinate complementary\nlocal-global properties from CNN and Transf/Attention architectures, which led\nto a new era of hybrid models. The past years have witnessed substantial growth\nin hybrid CNN-Transf/Attention models across diverse MIA problems. In this\nsystematic review, we survey existing hybrid CNN-Transf/Attention models,\nreview and unravel key architectural designs, analyse breakthroughs, and\nevaluate current and future opportunities as well as challenges. We also\nintroduced a comprehensive analysis framework on generalisation opportunities\nof scientific and clinical impact, based on which new data-driven domain\ngeneralisation and adaptation methods can be stimulated.\n","authors":["Giorgos Papanastasiou","Nikolaos Dikaios","Jiahao Huang","Chengjia Wang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12771v1","updated":"2023-07-24T13:19:15Z","published":"2023-07-24T13:19:15Z","title":"Detecting disturbances in network-coupled dynamical systems with machine\n learning","summary":" Identifying disturbances in network-coupled dynamical systems without\nknowledge of the disturbances or underlying dynamics is a problem with a wide\nrange of applications. For example, one might want to know which nodes in the\nnetwork are being disturbed and identify the type of disturbance. Here we\npresent a model-free method based on machine learning to identify such unknown\ndisturbances based only on prior observations of the system when forced by a\nknown training function. 
We find that this method is able to identify the\nlocations and properties of many different types of unknown disturbances using\na variety of known forcing functions. We illustrate our results both with\nlinear and nonlinear disturbances using food web and neuronal activity models.\nFinally, we discuss how to scale our method to large networks.\n","authors":["Per Sebastian Skardal","Juan G. Restrepo"],"pdf_url":"https://arxiv.org/pdf/2307.12771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05732v6","updated":"2023-07-24T13:15:14Z","published":"2022-09-13T04:58:35Z","title":"Rényi Divergence Deep Mutual Learning","summary":" This paper revisits Deep Mutual Learning (DML), a simple yet effective\ncomputing paradigm. We propose using R\\'{e}nyi divergence instead of the KL\ndivergence, which is more flexible and tunable, to improve vanilla DML. This\nmodification is able to consistently improve performance over vanilla DML with\nlimited additional complexity. The convergence properties of the proposed\nparadigm are analyzed theoretically, and Stochastic Gradient Descent with a\nconstant learning rate is shown to converge with $\\mathcal{O}(1)$-bias in the\nworst case scenario for nonconvex optimization tasks. That is, learning will\nreach nearby local optima but continue searching within a bounded scope, which\nmay help mitigate overfitting. Finally, our extensive empirical results\ndemonstrate the advantage of combining DML and R\\'{e}nyi divergence, leading to\nfurther improvement in model generalization.\n","authors":["Weipeng Huang","Junjie Tao","Changbo Deng","Ming Fan","Wenqiang Wan","Qi Xiong","Guangyuan Piao"],"pdf_url":"https://arxiv.org/pdf/2209.05732v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11531v2","updated":"2023-07-24T13:04:48Z","published":"2022-09-23T11:36:32Z","title":"Deep Learning-based Anonymization of Chest Radiographs: A\n Utility-preserving Measure for Patient Privacy","summary":" Robust and reliable anonymization of chest radiographs constitutes an\nessential step before publishing large datasets of such for research purposes.\nThe conventional anonymization process is carried out by obscuring personal\ninformation in the images with black boxes and removing or replacing\nmeta-information. However, such simple measures retain biometric information in\nthe chest radiographs, allowing patients to be re-identified by a linkage\nattack. Therefore, there is an urgent need to obfuscate the biometric\ninformation appearing in the images. We propose the first deep learning-based\napproach (PriCheXy-Net) to targetedly anonymize chest radiographs while\nmaintaining data utility for diagnostic and machine learning purposes. Our\nmodel architecture is a composition of three independent neural networks that,\nwhen collectively used, allow for learning a deformation field that is able to\nimpede patient re-identification. Quantitative results on the ChestX-ray14\ndataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC)\nafter re-training with little impact on the abnormality classification\nperformance. This indicates the ability to preserve underlying abnormality\npatterns while increasing patient privacy. 
Lastly, we compare our proposed\nanonymization approach with two other obfuscation-based methods (Privacy-Net,\nDP-Pix) and demonstrate the superiority of our method towards resolving the\nprivacy-utility trade-off for chest radiographs.\n","authors":["Kai Packhäuser","Sebastian Gündel","Florian Thamm","Felix Denzinger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2209.11531v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.07620v2","updated":"2023-07-24T13:03:17Z","published":"2023-07-14T20:39:07Z","title":"Generalizable Embeddings with Cross-batch Metric Learning","summary":" Global average pooling (GAP) is a popular component in deep metric learning\n(DML) for aggregating features. Its effectiveness is often attributed to\ntreating each feature vector as a distinct semantic entity and GAP as a\ncombination of them. Albeit substantiated, such an explanation's algorithmic\nimplications to learn generalizable entities to represent unseen classes, a\ncrucial DML goal, remain unclear. To address this, we formulate GAP as a convex\ncombination of learnable prototypes. We then show that the prototype learning\ncan be expressed as a recursive process fitting a linear predictor to a batch\nof samples. Building on that perspective, we consider two batches of disjoint\nclasses at each iteration and regularize the learning by expressing the samples\nof a batch with the prototypes that are fitted to the other batch. We validate\nour approach on 4 popular DML benchmarks.\n","authors":["Yeti Z. Gurbuz","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2307.07620v2.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2212.07368v3","updated":"2023-07-24T12:53:23Z","published":"2022-12-14T17:46:17Z","title":"Shuffled Multi-Channel Sparse Signal Recovery","summary":" Mismatches between samples and their respective channel or target commonly\narise in several real-world applications. For instance, whole-brain calcium\nimaging of freely moving organisms, multiple-target tracking or multi-person\ncontactless vital sign monitoring may be severely affected by mismatched\nsample-channel assignments. To systematically address this fundamental problem,\nwe pose it as a signal reconstruction problem where we have lost\ncorrespondences between the samples and their respective channels. Assuming\nthat we have a sensing matrix for the underlying signals, we show that the\nproblem is equivalent to a structured unlabeled sensing problem, and establish\nsufficient conditions for unique recovery. To the best of our knowledge, a\nsampling result for the reconstruction of shuffled multi-channel signals has\nnot been considered in the literature and existing methods for unlabeled\nsensing cannot be directly applied. We extend our results to the case where the\nsignals admit a sparse representation in an overcomplete dictionary (i.e., the\nsensing matrix is not precisely known), and derive sufficient conditions for\nthe reconstruction of shuffled sparse signals. We propose a robust\nreconstruction method that combines sparse signal recovery with robust linear\nregression for the two-channel case. 
The performance and robustness of the\nproposed approach is illustrated in an application related to whole-brain\ncalcium imaging. The proposed methodology can be generalized to sparse signal\nrepresentations other than the ones considered in this work to be applied in a\nvariety of real-world problems with imprecise measurement or channel\nassignment.\n","authors":["Taulant Koka","Manolis C. Tsakiris","Michael Muma","Benjamín Béjar Haro"],"pdf_url":"https://arxiv.org/pdf/2212.07368v3.pdf","comment":"Submitted to TSP"},{"id":"http://arxiv.org/abs/2307.12754v1","updated":"2023-07-24T12:52:55Z","published":"2023-07-24T12:52:55Z","title":"Nonparametric Linear Feature Learning in Regression Through\n Regularisation","summary":" Representation learning plays a crucial role in automated feature selection,\nparticularly in the context of high-dimensional data, where non-parametric\nmethods often struggle. In this study, we focus on supervised learning\nscenarios where the pertinent information resides within a lower-dimensional\nlinear subspace of the data, namely the multi-index model. If this subspace\nwere known, it would greatly enhance prediction, computation, and\ninterpretation. To address this challenge, we propose a novel method for linear\nfeature learning with non-parametric prediction, which simultaneously estimates\nthe prediction function and the linear subspace. Our approach employs empirical\nrisk minimisation, augmented with a penalty on function derivatives, ensuring\nversatility. Leveraging the orthogonality and rotation invariance properties of\nHermite polynomials, we introduce our estimator, named RegFeaL. By utilising\nalternative minimisation, we iteratively rotate the data to improve alignment\nwith leading directions and accurately estimate the relevant dimension in\npractical settings. We establish that our method yields a consistent estimator\nof the prediction function with explicit rates. Additionally, we provide\nempirical results demonstrating the performance of RegFeaL in various\nexperiments.\n","authors":["Bertille Follain","Umut Simsekli","Francis Bach"],"pdf_url":"https://arxiv.org/pdf/2307.12754v1.pdf","comment":"43 pages, 16 figures"},{"id":"http://arxiv.org/abs/2307.12745v1","updated":"2023-07-24T12:36:05Z","published":"2023-07-24T12:36:05Z","title":"Concept-based explainability for an EEG transformer model","summary":" Deep learning models are complex due to their size, structure, and inherent\nrandomness in training procedures. Additional complexity arises from the\nselection of datasets and inductive biases. Addressing these challenges for\nexplainability, Kim et al. (2018) introduced Concept Activation Vectors (CAVs),\nwhich aim to understand deep models' internal states in terms of human-aligned\nconcepts. These concepts correspond to directions in latent space, identified\nusing linear discriminants. Although this method was first applied to image\nclassification, it was later adapted to other domains, including natural\nlanguage processing. In this work, we attempt to apply the method to\nelectroencephalogram (EEG) data for explainability in Kostas et al.'s BENDR\n(2021), a large-scale transformer model. A crucial part of this endeavor\ninvolves defining the explanatory concepts and selecting relevant datasets to\nground concepts in the latent space. Our focus is on two mechanisms for EEG\nconcept formation: the use of externally labeled EEG datasets, and the\napplication of anatomically defined concepts. 
The former approach is a\nstraightforward generalization of methods used in image classification, while\nthe latter is novel and specific to EEG. We present evidence that both\napproaches to concept formation yield valuable insights into the\nrepresentations learned by deep EEG models.\n","authors":["Anders Gjølbye Madsen","William Theodor Lehn-Schiøler","Áshildur Jónsdóttir","Bergdís Arnardóttir","Lars Kai Hansen"],"pdf_url":"https://arxiv.org/pdf/2307.12745v1.pdf","comment":"To appear in proceedings of 2023 IEEE International workshop on\n Machine Learning for Signal Processing"},{"id":"http://arxiv.org/abs/2207.09657v3","updated":"2023-07-24T12:35:18Z","published":"2022-07-20T05:22:26Z","title":"Reducing Training Time in Cross-Silo Federated Learning using Multigraph\n Topology","summary":" Federated learning is an active research topic since it enables several\nparticipants to jointly train a model without sharing local data. Currently,\ncross-silo federated learning is a popular training setting that utilizes a few\nhundred reliable data silos with high-speed access links to training a model.\nWhile this approach has been widely applied in real-world scenarios, designing\na robust topology to reduce the training time remains an open problem. In this\npaper, we present a new multigraph topology for cross-silo federated learning.\nWe first construct the multigraph using the overlay graph. We then parse this\nmultigraph into different simple graphs with isolated nodes. The existence of\nisolated nodes allows us to perform model aggregation without waiting for other\nnodes, hence effectively reducing the training time. Intensive experiments on\nthree public datasets show that our proposed method significantly reduces the\ntraining time compared with recent state-of-the-art topologies while\nmaintaining the accuracy of the learned model. Our code can be found at\nhttps://github.com/aioz-ai/MultigraphFL\n","authors":["Tuong Do","Binh X. Nguyen","Vuong Pham","Toan Tran","Erman Tjiputra","Quang Tran","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2207.09657v3.pdf","comment":"accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2302.09629v2","updated":"2023-07-24T12:33:09Z","published":"2023-02-19T17:15:56Z","title":"BiofilmScanner: A Computational Intelligence Approach to Obtain\n Bacterial Cell Morphological Attributes from Biofilm Image","summary":" Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for\nsulfate-reducing bacteria (SRB) that are associated with corrosion issues\ncaused by microorganisms. SRB-based biofilms are thought to be responsible for\nthe billion-dollar-per-year bio-corrosion of metal infrastructure.\nUnderstanding the extraction of the bacterial cells' shape and size properties\nin the SRB-biofilm at different growth stages will assist with the design of\nanti-corrosion techniques. However, numerous issues affect current approaches,\nincluding time-consuming geometric property extraction, low efficiency, and\nhigh error rates. This paper proposes BiofilScanner, a Yolact-based deep\nlearning method integrated with invariant moments to address these problems.\nOur approach efficiently detects and segments bacterial cells in an SRB image\nwhile simultaneously invariant moments measure the geometric characteristics of\nthe segmented cells with low errors. 
The numerical experiments of the proposed\nmethod demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our\nearlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring\nthe geometric properties of the cell. Furthermore, the BiofilmScanner achieved\nan F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67%\nand 75.18%, respectively.\n","authors":["Md Hafizur Rahman","Md Ali Azam","Md Abir Hossen","Shankarachary Ragi","Venkataramana Gadhamshetty"],"pdf_url":"https://arxiv.org/pdf/2302.09629v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2306.16177v3","updated":"2023-07-24T12:32:58Z","published":"2023-06-28T12:58:42Z","title":"Defining data science: a new field of inquiry","summary":" Data science is not a science. It is a research paradigm. Its power, scope,\nand scale will surpass science, our most powerful research paradigm, to enable\nknowledge discovery and change our world. We have yet to understand and define\nit, vital to realizing its potential and managing its risks. Modern data\nscience is in its infancy. Emerging slowly since 1962 and rapidly since 2000,\nit is a fundamentally new field of inquiry, one of the most active, powerful,\nand rapidly evolving 21st century innovations. Due to its value, power, and\napplicability, it is emerging in over 40 disciplines, hundreds of research\nareas, and thousands of applications. Millions of data science publications\ncontain myriad definitions of data science and data science problem solving.\nDue to its infancy, many definitions are independent, application specific,\nmutually incomplete, redundant, or inconsistent, hence so is data science. This\nresearch addresses this data science multiple definitions challenge by\nproposing the development of coherent, unified definition based on a data\nscience reference framework using a data science journal for the data science\ncommunity to achieve such a definition. This paper provides candidate\ndefinitions for essential data science artifacts that are required to discuss\nsuch a definition. They are based on the classical research paradigm concept\nconsisting of a philosophy of data science, the data science problem solving\nparadigm, and the six component data science reference framework (axiology,\nontology, epistemology, methodology, methods, technology) that is a frequently\ncalled for unifying framework with which to define, unify, and evolve data\nscience. It presents challenges for defining data science, solution approaches,\ni.e., means for defining data science, and their requirements and benefits as\nthe basis of a comprehensive solution.\n","authors":["Michael L Brodie"],"pdf_url":"https://arxiv.org/pdf/2306.16177v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12865v3","updated":"2023-07-24T12:08:50Z","published":"2023-03-22T18:59:48Z","title":"NeRF-GAN Distillation for Efficient 3D-Aware Generation with\n Convolutions","summary":" Pose-conditioned convolutional generative models struggle with high-quality\n3D-consistent image generation from single-view datasets, due to their lack of\nsufficient 3D priors. Recently, the integration of Neural Radiance Fields\n(NeRFs) and generative models, such as Generative Adversarial Networks (GANs),\nhas transformed 3D-aware generation from single-view images. NeRF-GANs exploit\nthe strong inductive bias of neural 3D representations and volumetric rendering\nat the cost of higher computational complexity. 
This study aims at revisiting\npose-conditioned 2D GANs for efficient 3D-aware generation at inference time by\ndistilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and\neffective method, based on re-using the well-disentangled latent space of a\npre-trained NeRF-GAN in a pose-conditioned convolutional network to directly\ngenerate 3D-consistent images corresponding to the underlying 3D\nrepresentations. Experiments on several datasets demonstrate that the proposed\nmethod obtains results comparable with volumetric rendering in terms of quality\nand 3D consistency while benefiting from the computational advantage of\nconvolutional networks. The code will be available at:\nhttps://github.com/mshahbazi72/NeRF-GAN-Distillation\n","authors":["Mohamad Shahbazi","Evangelos Ntavelis","Alessio Tonioni","Edo Collins","Danda Pani Paudel","Martin Danelljan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12716v1","updated":"2023-07-24T11:55:32Z","published":"2023-07-24T11:55:32Z","title":"Safety Performance of Neural Networks in the Presence of Covariate Shift","summary":" Covariate shift may impact the operational safety performance of neural\nnetworks. A re-evaluation of the safety performance, however, requires\ncollecting new operational data and creating corresponding ground truth labels,\nwhich often is not possible during operation. We are therefore proposing to\nreshape the initial test set, as used for the safety performance evaluation\nprior to deployment, based on an approximation of the operational data. This\napproximation is obtained by observing and learning the distribution of\nactivation patterns of neurons in the network during operation. The reshaped\ntest set reflects the distribution of neuron activation values as observed\nduring operation, and may therefore be used for re-evaluating safety\nperformance in the presence of covariate shift. First, we derive conservative\nbounds on the values of neurons by applying finite binning and static dataflow\nanalysis. Second, we formulate a mixed integer linear programming (MILP)\nconstraint for constructing the minimum set of data points to be removed in the\ntest set, such that the difference between the discretized test and operational\ndistributions is bounded. We discuss potential benefits and limitations of this\nconstraint-based approach based on our initial experience with an implemented\nresearch prototype.\n","authors":["Chih-Hong Cheng","Harald Ruess","Konstantinos Theodorou"],"pdf_url":"https://arxiv.org/pdf/2307.12716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13871v2","updated":"2023-07-24T11:44:01Z","published":"2023-04-26T23:34:40Z","title":"Typical and atypical solutions in non-convex neural networks with\n discrete and continuous weights","summary":" We study the binary and continuous negative-margin perceptrons as simple\nnon-convex neural network models learning random rules and associations. We\nanalyze the geometry of the landscape of solutions in both models and find\nimportant similarities and differences. Both models exhibit subdominant\nminimizers which are extremely flat and wide. These minimizers coexist with a\nbackground of dominant solutions which are composed by an exponential number of\nalgorithmically inaccessible small clusters for the binary case (the frozen\n1-RSB phase) or a hierarchical structure of clusters of different sizes for the\nspherical case (the full RSB phase). 
In both cases, when a certain threshold in\nconstraint density is crossed, the local entropy of the wide flat minima\nbecomes non-monotonic, indicating a break-up of the space of robust solutions\ninto disconnected components. This has a strong impact on the behavior of\nalgorithms in binary models, which cannot access the remaining isolated\nclusters. For the spherical case the behaviour is different, since even beyond\nthe disappearance of the wide flat minima the remaining solutions are shown to\nalways be surrounded by a large number of other solutions at any distance, up\nto capacity. Indeed, we exhibit numerical evidence that algorithms seem to find\nsolutions up to the SAT/UNSAT transition, that we compute here using an 1RSB\napproximation. For both models, the generalization performance as a learning\ndevice is shown to be greatly improved by the existence of wide flat minimizers\neven when trained in the highly underconstrained regime of very negative\nmargins.\n","authors":["Carlo Baldassi","Enrico M. Malatesta","Gabriele Perugini","Riccardo Zecchina"],"pdf_url":"https://arxiv.org/pdf/2304.13871v2.pdf","comment":"34 pages, 13 figures"},{"id":"http://arxiv.org/abs/2210.17230v3","updated":"2023-07-24T11:43:26Z","published":"2022-10-31T11:15:48Z","title":"Lipschitz-regularized gradient flows and generative particle algorithms\n for high-dimensional scarce data","summary":" We build a new class of generative algorithms capable of efficiently learning\nan arbitrary target distribution from possibly scarce, high-dimensional data\nand subsequently generate new samples. These generative algorithms are\nparticle-based and are constructed as gradient flows of Lipschitz-regularized\nKullback-Leibler or other $f$-divergences, where data from a source\ndistribution can be stably transported as particles, towards the vicinity of\nthe target distribution. As a highlighted result in data integration, we\ndemonstrate that the proposed algorithms correctly transport gene expression\ndata points with dimension exceeding 54K, while the sample size is typically\nonly in the hundreds.\n","authors":["Hyemin Gu","Panagiota Birmpa","Yannis Pantazis","Luc Rey-Bellet","Markos A. Katsoulakis"],"pdf_url":"https://arxiv.org/pdf/2210.17230v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12703v1","updated":"2023-07-24T11:37:02Z","published":"2023-07-24T11:37:02Z","title":"Policy Gradient Optimal Correlation Search for Variance Reduction in\n Monte Carlo simulation and Maximum Optimal Transport","summary":" We propose a new algorithm for variance reduction when estimating $f(X_T)$\nwhere $X$ is the solution to some stochastic differential equation and $f$ is a\ntest function. The new estimator is $(f(X^1_T) + f(X^2_T))/2$, where $X^1$ and\n$X^2$ have same marginal law as $X$ but are pathwise correlated so that to\nreduce the variance. The optimal correlation function $\\rho$ is approximated by\na deep neural network and is calibrated along the trajectories of $(X^1, X^2)$\nby policy gradient and reinforcement learning techniques. 
Finding an optimal\ncoupling given marginal laws has links with maximum optimal transport.\n","authors":["Pierre Bras","Gilles Pagès"],"pdf_url":"https://arxiv.org/pdf/2307.12703v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2303.09340v3","updated":"2023-07-24T11:34:21Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v3.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.12698v1","updated":"2023-07-24T11:27:14Z","published":"2023-07-24T11:27:14Z","title":"MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised\n Learning of Motion and Content Features","summary":" Self-supervised learning of visual representations has been focusing on\nlearning content features, which do not capture object motion or location, and\nfocus on identifying and differentiating objects in images and videos. On the\nother hand, optical flow estimation is a task that does not involve\nunderstanding the content of the images on which it is estimated. We unify the\ntwo approaches and introduce MC-JEPA, a joint-embedding predictive architecture\nand self-supervised learning approach to jointly learn optical flow and content\nfeatures within a shared encoder, demonstrating that the two associated\nobjectives; the optical flow estimation objective and the self-supervised\nlearning objective; benefit from each other and thus learn content features\nthat incorporate motion information. 
The proposed approach achieves performance\non-par with existing unsupervised optical flow benchmarks, as well as with\ncommon self-supervised learning approaches on downstream tasks such as semantic\nsegmentation of images and videos.\n","authors":["Adrien Bardes","Jean Ponce","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2307.12698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10763v3","updated":"2023-07-24T11:15:47Z","published":"2023-02-12T12:19:57Z","title":"Contrastive Learning and the Emergence of Attributes Associations","summary":" In response to an object presentation, supervised learning schemes generally\nrespond with a parsimonious label. Upon a similar presentation we humans\nrespond again with a label, but are flooded, in addition, by a myriad of\nassociations. A significant portion of these consist of the presented object\nattributes. Contrastive learning is a semi-supervised learning scheme based on\nthe application of identity preserving transformations on the object input\nrepresentations. It is conjectured in this work that these same applied\ntransformations preserve, in addition to the identity of the presented object,\nalso the identity of its semantically meaningful attributes. The corollary of\nthis is that the output representations of such a contrastive learning scheme\ncontain valuable information not only for the classification of the presented\nobject, but also for the presence or absence decision of any attribute of\ninterest. Simulation results which demonstrate this idea and the feasibility of\nthis conjecture are presented.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2302.10763v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2210.12583v2","updated":"2023-07-24T11:13:21Z","published":"2022-10-23T00:45:05Z","title":"Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model\n Predictive Control","summary":" Model-based control requires an accurate model of the system dynamics for\nprecisely and safely controlling the robot in complex and dynamic environments.\nMoreover, in the presence of variations in the operating conditions, the model\nshould be continuously refined to compensate for dynamics changes. In this\npaper, we present a self-supervised learning approach that actively models the\ndynamics of nonlinear robotic systems. We combine offline learning from past\nexperience and online learning from current robot interaction with the unknown\nenvironment. These two ingredients enable a highly sample-efficient and\nadaptive learning process, capable of accurately inferring model dynamics in\nreal-time even in operating regimes that greatly differ from the training\ndistribution. Moreover, we design an uncertainty-aware model predictive\ncontroller that is heuristically conditioned to the aleatoric (data)\nuncertainty of the learned dynamics. This controller actively chooses the\noptimal control actions that (i) optimize the control performance and (ii)\nimprove the efficiency of online learning sample collection. We demonstrate the\neffectiveness of our method through a series of challenging real-world\nexperiments using a quadrotor system. 
Our approach showcases high resilience\nand generalization capabilities by consistently adapting to unseen flight\nconditions, while it significantly outperforms classical and adaptive control\nbaselines.\n","authors":["Alessandro Saviolo","Jonathan Frey","Abhishek Rathod","Moritz Diehl","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2210.12583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12689v1","updated":"2023-07-24T11:04:22Z","published":"2023-07-24T11:04:22Z","title":"Addressing the Impact of Localized Training Data in Graph Neural\n Networks","summary":" Graph Neural Networks (GNNs) have achieved notable success in learning from\ngraph-structured data, owing to their ability to capture intricate dependencies\nand relationships between nodes. They excel in various applications, including\nsemi-supervised node classification, link prediction, and graph generation.\nHowever, it is important to acknowledge that the majority of state-of-the-art\nGNN models are built upon the assumption of an in-distribution setting, which\nhinders their performance on real-world graphs with dynamic structures. In this\narticle, we aim to assess the impact of training GNNs on localized subsets of\nthe graph. Such restricted training data may lead to a model that performs well\nin the specific region it was trained on but fails to generalize and make\naccurate predictions for the entire graph. In the context of graph-based\nsemi-supervised learning (SSL), resource constraints often lead to scenarios\nwhere the dataset is large, but only a portion of it can be labeled, affecting\nthe model's performance. This limitation affects tasks like anomaly detection\nor spam detection when labeling processes are biased or influenced by human\nsubjectivity. To tackle the challenges posed by localized training data, we\napproach the problem as an out-of-distribution (OOD) data issue by aligning\nthe distributions between the training data, which represents a small portion\nof labeled data, and the graph inference process that involves making\npredictions for the entire graph. We propose a regularization method to\nminimize distributional discrepancies between localized training data and graph\ninference, improving model performance on OOD data. Extensive tests on popular\nGNN models show significant performance improvement on three citation GNN\nbenchmark datasets. The regularization approach effectively enhances model\nadaptation and generalization, overcoming challenges posed by OOD data.\n","authors":["Singh Akansha"],"pdf_url":"https://arxiv.org/pdf/2307.12689v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.12679v1","updated":"2023-07-24T10:33:32Z","published":"2023-07-24T10:33:32Z","title":"An Estimator for the Sensitivity to Perturbations of Deep Neural\n Networks","summary":" For Deep Neural Networks (DNNs) to become useful in safety-critical\napplications, such as self-driving cars and disease diagnosis, they must be\nstable to perturbations in input and model parameters. Characterizing the\nsensitivity of a DNN to perturbations is necessary to determine minimal\nbit-width precision that may be used to safely represent the network. However,\nno general result exists that is capable of predicting the sensitivity of a\ngiven DNN to round-off error, noise, or other perturbations in input. This\npaper derives an estimator that can predict such quantities. 
The estimator is\nderived via inequalities and matrix norms, and the resulting quantity is\nroughly analogous to a condition number for the entire neural network. An\napproximation of the estimator is tested on two Convolutional Neural Networks,\nAlexNet and VGG-19, using the ImageNet dataset. For each of these networks, the\ntightness of the estimator is explored via random perturbations and adversarial\nattacks.\n","authors":["Naman Maheshwari","Nicholas Malaya","Scott Moe","Jaydeep P. Kulkarni","Sudhanva Gurumurthi"],"pdf_url":"https://arxiv.org/pdf/2307.12679v1.pdf","comment":"Actual work and paper concluded in January 2019"},{"id":"http://arxiv.org/abs/2307.12672v1","updated":"2023-07-24T10:20:14Z","published":"2023-07-24T10:20:14Z","title":"Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked\n Image Modeling","summary":" In dynamic Magnetic Resonance Imaging (MRI), k-space is typically\nundersampled due to limited scan time, resulting in aliasing artifacts in the\nimage domain. Hence, dynamic MR reconstruction requires not only modeling\nspatial frequency components in the x and y directions of k-space but also\nconsidering temporal redundancy. Most previous works rely on image-domain\nregularizers (priors) to conduct MR reconstruction. In contrast, we focus on\ninterpolating the undersampled k-space before obtaining images with Fourier\ntransform. In this work, we connect masked image modeling with k-space\ninterpolation and propose a novel Transformer-based k-space Global\nInterpolation Network, termed k-GIN. Our k-GIN learns global dependencies among\nlow- and high-frequency components of 2D+t k-space and uses it to interpolate\nunsampled data. Further, we propose a novel k-space Iterative Refinement Module\n(k-IRM) to enhance the high-frequency components learning. We evaluate our\napproach on 92 in-house 2D+t cardiac MR subjects and compare it to MR\nreconstruction methods with image-domain regularizers. Experiments show that\nour proposed k-space interpolation method quantitatively and qualitatively\noutperforms baseline methods. Importantly, the proposed approach achieves\nsubstantially higher robustness and generalizability in cases of\nhighly-undersampled MR data.\n","authors":["Jiazhen Pan","Suprosanna Shit","Özgün Turgut","Wenqi Huang","Hongwei Bran Li","Nil Stolt-Ansó","Thomas Küstner","Kerstin Hammernik","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.12672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12667v1","updated":"2023-07-24T10:14:51Z","published":"2023-07-24T10:14:51Z","title":"TransFusion: Generating Long, High Fidelity Time Series using Diffusion\n Models with Transformers","summary":" The generation of high-quality, long-sequenced time-series data is essential\ndue to its wide range of applications. In the past, standalone Recurrent and\nConvolutional Neural Network-based Generative Adversarial Networks (GAN) were\nused to synthesize time-series data. However, they are inadequate for\ngenerating long sequences of time-series data due to limitations in the\narchitecture. Furthermore, GANs are well known for their training instability\nand mode collapse problem. To address this, we propose TransFusion, a\ndiffusion, and transformers-based generative model to generate high-quality\nlong-sequence time-series data. We have stretched the sequence length to 384,\nand generated high-quality synthetic data. To the best of our knowledge, this\nis the first study that has been done with this long-sequence length. 
Also, we\nintroduce two evaluation metrics to evaluate the quality of the synthetic data\nas well as its predictive characteristics. We evaluate TransFusion with a wide\nvariety of visual and empirical metrics, and TransFusion outperforms the\nprevious state-of-the-art by a significant margin.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2307.12667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12660v1","updated":"2023-07-24T10:04:27Z","published":"2023-07-24T10:04:27Z","title":"Online Continual Learning in Keyword Spotting for Low-Resource Devices\n via Pooling High-Order Temporal Statistics","summary":" Keyword Spotting (KWS) models on embedded devices should adapt fast to new\nuser-defined words without forgetting previous ones. Embedded devices have\nlimited storage and computational resources, thus, they cannot save samples or\nupdate large models. We consider the setup of embedded online continual\nlearning (EOCL), where KWS models with frozen backbone are trained to\nincrementally recognize new words from a non-repeated stream of samples, seen\none at a time. To this end, we propose Temporal Aware Pooling (TAP) which\nconstructs an enriched feature space computing high-order moments of speech\nfeatures extracted by a pre-trained backbone. Our method, TAP-SLDA, updates a\nGaussian model for each class on the enriched feature space to effectively use\naudio representations. In experimental analyses, TAP-SLDA outperforms\ncompetitors on several setups, backbones, and baselines, bringing a relative\naverage gain of 11.3% on the GSC dataset.\n","authors":["Umberto Michieli","Pablo Peso Parada","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.12660v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2306.12231v2","updated":"2023-07-24T09:36:05Z","published":"2023-06-21T12:44:52Z","title":"Predicting protein variants with equivariant graph neural networks","summary":" Pre-trained models have been successful in many protein engineering tasks.\nMost notably, sequence-based models have achieved state-of-the-art performance\non protein fitness prediction while structure-based models have been used\nexperimentally to develop proteins with enhanced functions. However, there is a\nresearch gap in comparing structure- and sequence-based methods for predicting\nprotein variants that are better than the wildtype protein. This paper aims to\naddress this gap by conducting a comparative study between the abilities of\nequivariant graph neural networks (EGNNs) and sequence-based approaches to\nidentify promising amino-acid mutations. The results show that our proposed\nstructural approach achieves a competitive performance to sequence-based\nmethods while being trained on significantly fewer molecules. 
Additionally, we\nfind that combining assay labelled data with structure pre-trained models\nyields similar trends as with sequence pre-trained models.\n Our code and trained models can be found at:\nhttps://github.com/semiluna/partIII-amino-acid-prediction.\n","authors":["Antonia Boca","Simon Mathis"],"pdf_url":"https://arxiv.org/pdf/2306.12231v2.pdf","comment":"4 pages, 2 figures, accepted to the 2023 ICML Workshop on\n Computational Biology"},{"id":"http://arxiv.org/abs/2307.12644v1","updated":"2023-07-24T09:35:47Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" Remote Photoplethysmography (rPPG) is a technology that utilizes the light\nabsorption properties of hemoglobin, captured via camera, to analyze and\nmeasure blood volume pulse (BVP). By analyzing the measured BVP, various\nphysiological signals such as heart rate, stress levels, and blood pressure can\nbe derived, enabling applications such as the early prediction of\ncardiovascular diseases. rPPG is a rapidly evolving field as it allows the\nmeasurement of vital signals using camera-equipped devices without the need for\nadditional devices such as blood pressure monitors or pulse oximeters, and\nwithout the assistance of medical experts. Despite extensive efforts and\nadvances in this field, serious challenges remain, including issues related to\nskin color, camera characteristics, ambient lighting, and other sources of\nnoise, which degrade performance accuracy. We argue that fair and evaluable\nbenchmarking is urgently required to overcome these challenges and make any\nmeaningful progress from both academic and commercial perspectives. In most\nexisting work, models are trained, tested, and validated only on limited\ndatasets. Worse still, some studies lack available code or reproducibility,\nmaking it difficult to fairly evaluate and compare performance. Therefore, the\npurpose of this study is to provide a benchmarking framework to evaluate\nvarious rPPG techniques across a wide range of datasets for fair evaluation and\ncomparison, including both conventional non-deep neural network (non-DNN) and\ndeep neural network (DNN) methods. GitHub URL:\nhttps://github.com/remotebiosensing/rppg.\n","authors":["Dae Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.12639v1","updated":"2023-07-24T09:30:30Z","published":"2023-07-24T09:30:30Z","title":"Fake News Detection Through Graph-based Neural Networks: A Survey","summary":" The popularity of online social networks has enabled rapid dissemination of\ninformation. People now can share and consume information much more rapidly\nthan ever before. However, low-quality and/or accidentally/deliberately fake\ninformation can also spread rapidly. This can lead to considerable and negative\nimpacts on society. Identifying, labelling and debunking online misinformation\nas early as possible has become an increasingly urgent problem. Many methods\nhave been proposed to detect fake news including many deep learning and\ngraph-based approaches. In recent years, graph-based methods have yielded\nstrong results, as they can closely model the social context and propagation\nprocess of online news. 
In this paper, we present a systematic review of fake\nnews detection studies based on graph-based and deep learning-based techniques.\nWe classify existing graph-based methods into knowledge-driven methods,\npropagation-based methods, and heterogeneous social context-based methods,\ndepending on how a graph structure is constructed to model news related\ninformation flows. We further discuss the challenges and open problems in\ngraph-based fake news detection and identify future research directions.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2307.12639v1.pdf","comment":"18 pages, 3 tables, 7 figures"},{"id":"http://arxiv.org/abs/2304.03981v2","updated":"2023-07-24T09:24:04Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12636v1","updated":"2023-07-24T09:19:38Z","published":"2023-07-24T09:19:38Z","title":"Identifying drivers and mitigators for congestion and redispatch in the\n German electric power system with explainable AI","summary":" The transition to a sustainable energy supply challenges the operation of\nelectric power systems in manifold ways. Transmission grid loads increase as\nwind and solar power are often installed far away from the consumers. In\nextreme cases, system operators must intervene via countertrading or redispatch\nto ensure grid stability. In this article, we provide a data-driven analysis of\ncongestion in the German transmission grid. We develop an explainable machine\nlearning model to predict the volume of redispatch and countertrade on an\nhourly basis. The model reveals factors that drive or mitigate grid congestion\nand quantifies their impact. We show that, as expected, wind power generation\nis the main driver, but hydropower and cross-border electricity trading also\nplay an essential role. Solar power, on the other hand, has no mitigating\neffect. 
Our results suggest that a change to the market design would alleviate\ncongestion.\n","authors":["Maurizio Titz","Sebastian Pütz","Dirk Witthaut"],"pdf_url":"https://arxiv.org/pdf/2307.12636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14430v3","updated":"2023-07-24T09:15:02Z","published":"2022-09-28T21:31:43Z","title":"Minimax Optimal Kernel Operator Learning via Multilevel Training","summary":" Learning mappings between infinite-dimensional function spaces has achieved\nempirical success in many disciplines of machine learning, including generative\nmodeling, functional data analysis, causal inference, and multi-agent\nreinforcement learning. In this paper, we study the statistical limit of\nlearning a Hilbert-Schmidt operator between two infinite-dimensional Sobolev\nreproducing kernel Hilbert spaces. We establish the information-theoretic lower\nbound in terms of the Sobolev Hilbert-Schmidt norm and show that a\nregularization that learns the spectral components below the bias contour and\nignores the ones that are above the variance contour can achieve the optimal\nlearning rate. At the same time, the spectral components between the bias and\nvariance contours give us flexibility in designing computationally feasible\nmachine learning algorithms. Based on this observation, we develop a multilevel\nkernel operator learning algorithm that is optimal when learning linear\noperators between infinite-dimensional function spaces.\n","authors":["Jikai Jin","Yiping Lu","Jose Blanchet","Lexing Ying"],"pdf_url":"https://arxiv.org/pdf/2209.14430v3.pdf","comment":"ICLR 2023 spotlight"},{"id":"http://arxiv.org/abs/2307.12625v1","updated":"2023-07-24T08:56:25Z","published":"2023-07-24T08:56:25Z","title":"De-confounding Representation Learning for Counterfactual Inference on\n Continuous Treatment via Generative Adversarial Network","summary":" Counterfactual inference for continuous rather than binary treatment\nvariables is more common in real-world causal inference tasks. While there are\nalready some sample reweighting methods based on Marginal Structural Model for\neliminating the confounding bias, they generally focus on removing the\ntreatment's linear dependence on confounders and rely on the accuracy of the\nassumed parametric models, which are usually unverifiable. In this paper, we\npropose a de-confounding representation learning (DRL) framework for\ncounterfactual outcome estimation of continuous treatment by generating the\nrepresentations of covariates disentangled with the treatment variables. The\nDRL is a non-parametric model that eliminates both linear and nonlinear\ndependence between treatment and covariates. Specifically, we train the\ncorrelations between the de-confounded representations and the treatment\nvariables against the correlations between the covariate representations and\nthe treatment variables to eliminate confounding bias. Further, a\ncounterfactual inference network is embedded into the framework to make the\nlearned representations serve both de-confounding and trusted inference.\nExtensive experiments on synthetic datasets show that the DRL model performs\nsuperiorly in learning de-confounding representations and outperforms\nstate-of-the-art counterfactual inference models for continuous treatment\nvariables. 
In addition, we apply the DRL model to a real-world medical dataset\nMIMIC and demonstrate a detailed causal relationship between red cell width\ndistribution and mortality.\n","authors":["Yonghe Zhao","Qiang Huang","Haolong Zeng","Yun Pen","Huiyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.12625v1.pdf","comment":"15 pages,4 figures"},{"id":"http://arxiv.org/abs/2307.12617v1","updated":"2023-07-24T08:46:12Z","published":"2023-07-24T08:46:12Z","title":"Predicting Ordinary Differential Equations with Transformers","summary":" We develop a transformer-based sequence-to-sequence model that recovers\nscalar ordinary differential equations (ODEs) in symbolic form from irregularly\nsampled and noisy observations of a single solution trajectory. We demonstrate\nin extensive empirical evaluations that our model performs better or on par\nwith existing methods in terms of accurate recovery across various settings.\nMoreover, our method is efficiently scalable: after one-time pretraining on a\nlarge set of ODEs, we can infer the governing law of a new observed solution in\na few forward passes of the model.\n","authors":["Sören Becker","Michal Klein","Alexander Neitz","Giambattista Parascandolo","Niki Kilbertus"],"pdf_url":"https://arxiv.org/pdf/2307.12617v1.pdf","comment":"Published at ICML 2023"},{"id":"http://arxiv.org/abs/2307.09458v3","updated":"2023-07-24T08:32:40Z","published":"2023-07-18T17:39:04Z","title":"Does Circuit Analysis Interpretability Scale? Evidence from Multiple\n Choice Capabilities in Chinchilla","summary":" \\emph{Circuit analysis} is a promising technique for understanding the\ninternal mechanisms of language models. However, existing analyses are done in\nsmall models far from the state of the art. To address this, we present a case\nstudy of circuit analysis in the 70B Chinchilla model, aiming to test the\nscalability of circuit analysis. In particular, we study multiple-choice\nquestion answering, and investigate Chinchilla's capability to identify the\ncorrect answer \\emph{label} given knowledge of the correct answer \\emph{text}.\nWe find that the existing techniques of logit attribution, attention pattern\nvisualization, and activation patching naturally scale to Chinchilla, allowing\nus to identify and categorize a small set of `output nodes' (attention heads\nand MLPs).\n We further study the `correct letter' category of attention heads aiming to\nunderstand the semantics of their features, with mixed results. For normal\nmultiple-choice question answers, we significantly compress the query, key and\nvalue subspaces of the head without loss of performance when operating on the\nanswer labels for multiple-choice questions, and we show that the query and key\nsubspaces represent an `Nth item in an enumeration' feature to at least some\nextent. 
However, when we attempt to use this explanation to understand the\nheads' behaviour on a more general distribution including randomized answer\nlabels, we find that it is only a partial explanation, suggesting there is more\nto learn about the operation of `correct letter' heads on multiple choice\nquestion answering.\n","authors":["Tom Lieberum","Matthew Rahtz","János Kramár","Neel Nanda","Geoffrey Irving","Rohin Shah","Vladimir Mikulik"],"pdf_url":"https://arxiv.org/pdf/2307.09458v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12607v1","updated":"2023-07-24T08:32:27Z","published":"2023-07-24T08:32:27Z","title":"ExWarp: Extrapolation and Warping-based Temporal Supersampling for\n High-frequency Displays","summary":" High-frequency displays are gaining immense popularity because of their\nincreasing use in video games and virtual reality applications. However, the\nissue is that the underlying GPUs cannot continuously generate frames at this\nhigh rate -- this results in a less smooth and responsive experience.\nFurthermore, if the frame rate is not synchronized with the refresh rate, the\nuser may experience screen tearing and stuttering. Previous works propose\nincreasing the frame rate to provide a smooth experience on modern displays by\npredicting new frames based on past or future frames. Interpolation and\nextrapolation are two widely used algorithms that predict new frames.\nInterpolation requires waiting for the future frame to make a prediction, which\nadds additional latency. On the other hand, extrapolation provides a better\nquality of experience because it relies solely on past frames -- it does not\nincur any additional latency. The simplest method to extrapolate a frame is to\nwarp the previous frame using motion vectors; however, the warped frame may\ncontain improperly rendered visual artifacts due to dynamic objects -- this\nmakes it very challenging to design such a scheme. Past work has used DNNs to\nget good accuracy, however, these approaches are slow. This paper proposes\nExwarp -- an approach based on reinforcement learning (RL) to intelligently\nchoose between the slower DNN-based extrapolation and faster warping-based\nmethods to increase the frame rate by 4x with an almost negligible reduction in\nthe perceived image quality.\n","authors":["Akanksha Dixit","Yashashwee Chakrabarty","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2307.12607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12601v1","updated":"2023-07-24T08:21:13Z","published":"2023-07-24T08:21:13Z","title":"Concept backpropagation: An Explainable AI approach for visualising\n learned concepts in neural network models","summary":" Neural network models are widely used in a variety of domains, often as\nblack-box solutions, since they are not directly interpretable for humans. The\nfield of explainable artificial intelligence aims at developing explanation\nmethods to address this challenge, and several approaches have been developed\nover the recent years, including methods for investigating what type of\nknowledge these models internalise during the training process. Among these,\nthe method of concept detection, investigates which \\emph{concepts} neural\nnetwork models learn to represent in order to complete their tasks. In this\nwork, we present an extension to the method of concept detection, named\n\\emph{concept backpropagation}, which provides a way of analysing how the\ninformation representing a given concept is internalised in a given neural\nnetwork model. 
In this approach, the model input is perturbed in a manner\nguided by a trained concept probe for the described model, such that the\nconcept of interest is maximised. This allows for the visualisation of the\ndetected concept directly in the input space of the model, which in turn makes\nit possible to see what information the model depends on for representing the\ndescribed concept. We present results for this method applied to a various set\nof input modalities, and discuss how our proposed method can be used to\nvisualise what information trained concept probes use, and the degree as to\nwhich the representation of the probed concept is entangled within the neural\nnetwork model itself.\n","authors":["Patrik Hammersborg","Inga Strümke"],"pdf_url":"https://arxiv.org/pdf/2307.12601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12594v1","updated":"2023-07-24T08:11:59Z","published":"2023-07-24T08:11:59Z","title":"Optimized data collection and analysis process for studying\n solar-thermal desalination by machine learning","summary":" An effective interdisciplinary study between machine learning and\nsolar-thermal desalination requires a sufficiently large and well-analyzed\nexperimental datasets. This study develops a modified dataset collection and\nanalysis process for studying solar-thermal desalination by machine learning.\nBased on the optimized water condensation and collection process, the proposed\nexperimental method collects over one thousand datasets, which is ten times\nmore than the average number of datasets in previous works, by accelerating\ndata collection and reducing the time by 83.3%. On the other hand, the effects\nof dataset features are investigated by using three different algorithms,\nincluding artificial neural networks, multiple linear regressions, and random\nforests. The investigation focuses on the effects of dataset size and range on\nprediction accuracy, factor importance ranking, and the model's generalization\nability. The results demonstrate that a larger dataset can significantly\nimprove prediction accuracy when using artificial neural networks and random\nforests. Additionally, the study highlights the significant impact of dataset\nsize and range on ranking the importance of influence factors. Furthermore, the\nstudy reveals that the extrapolation data range significantly affects the\nextrapolation accuracy of artificial neural networks. Based on the results,\nmassive dataset collection and analysis of dataset feature effects are\nimportant steps in an effective and consistent machine learning process flow\nfor solar-thermal desalination, which can promote machine learning as a more\ngeneral tool in the field of solar-thermal desalination.\n","authors":["Guilong Peng","Senshan Sun","Yangjun Qin","Zhenwei Xu","Juxin Du","Swellam W. sharshir","A. W. Kandel","A. E. Kabeel","Nuo Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07515v2","updated":"2023-07-24T08:10:52Z","published":"2023-04-15T09:39:52Z","title":"S3M: Scalable Statistical Shape Modeling through Unsupervised\n Correspondences","summary":" Statistical shape models (SSMs) are an established way to represent the\nanatomy of a population with various clinically relevant applications. However,\nthey typically require domain expertise, and labor-intensive landmark\nannotations to construct. 
We address these shortcomings by proposing an\nunsupervised method that leverages deep geometric features and functional\ncorrespondences to simultaneously learn local and global shape structures\nacross population anatomies. Our pipeline significantly improves unsupervised\ncorrespondence estimation for SSMs compared to baseline methods, even on highly\nirregular surface topologies. We demonstrate this for two different anatomical\nstructures: the thyroid and a multi-chamber heart dataset. Furthermore, our\nmethod is robust enough to learn from noisy neural network predictions,\npotentially enabling scaling SSMs to larger patient populations without manual\nsegmentation annotation.\n","authors":["Lennart Bastian","Alexander Baumann","Emily Hoppe","Vincent Bürgin","Ha Young Kim","Mahdi Saleh","Benjamin Busam","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2304.07515v2.pdf","comment":"Accepted at MICCAI 2023. 13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12586v1","updated":"2023-07-24T07:58:18Z","published":"2023-07-24T07:58:18Z","title":"InVAErt networks: a data-driven framework for emulation, inference and\n identifiability analysis","summary":" Use of generative models and deep learning for physics-based systems is\ncurrently dominated by the task of emulation. However, the remarkable\nflexibility offered by data-driven architectures would suggest to extend this\nrepresentation to other aspects of system synthesis including model inversion\nand identifiability. We introduce inVAErt (pronounced \\emph{invert}) networks,\na comprehensive framework for data-driven analysis and synthesis of parametric\nphysical systems which uses a deterministic encoder and decoder to represent\nthe forward and inverse solution maps, normalizing flow to capture the\nprobabilistic distribution of system outputs, and a variational encoder\ndesigned to learn a compact latent representation for the lack of bijectivity\nbetween inputs and outputs. We formally investigate the selection of penalty\ncoefficients in the loss function and strategies for latent space sampling,\nsince we find that these significantly affect both training and testing\nperformance. We validate our framework through extensive numerical examples,\nincluding simple linear, nonlinear, and periodic maps, dynamical systems, and\nspatio-temporal PDEs.\n","authors":["Guoxiang Grayson Tong","Carlos A. Sing Long","Daniele E. Schiavazzi"],"pdf_url":"https://arxiv.org/pdf/2307.12586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09087v3","updated":"2023-07-24T07:55:19Z","published":"2023-06-15T12:33:39Z","title":"Deep learning based Meta-modeling for Multi-objective Technology\n Optimization of Electrical Machines","summary":" Optimization of rotating electrical machines is both time- and\ncomputationally expensive. Because of the different parametrization, design\noptimization is commonly executed separately for each machine technology. In\nthis paper, we present the application of a variational auto-encoder (VAE) to\noptimize two different machine technologies simultaneously, namely an\nasynchronous machine and a permanent magnet synchronous machine. After\ntraining, we employ a deep neural network and a decoder as meta-models to\npredict global key performance indicators (KPIs) and generate associated new\ndesigns, respectively, through unified latent space in the optimization loop.\nNumerical results demonstrate concurrent parametric multi-objective technology\noptimization in the high-dimensional design space. 
The VAE-based approach is\nquantitatively compared to a classical deep learning-based direct approach for\nKPIs prediction.\n","authors":["Vivek Parekh","Dominik Flore","Sebastian Schöps"],"pdf_url":"https://arxiv.org/pdf/2306.09087v3.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2307.12576v1","updated":"2023-07-24T07:47:21Z","published":"2023-07-24T07:47:21Z","title":"Self-refining of Pseudo Labels for Music Source Separation with Noisy\n Labeled Data","summary":" Music source separation (MSS) faces challenges due to the limited\navailability of correctly-labeled individual instrument tracks. With the push\nto acquire larger datasets to improve MSS performance, the inevitability of\nencountering mislabeled individual instrument tracks becomes a significant\nchallenge to address. This paper introduces an automated technique for refining\nthe labels in a partially mislabeled dataset. Our proposed self-refining\ntechnique, employed with a noisy-labeled dataset, results in only a 1% accuracy\ndegradation in multi-label instrument recognition compared to a classifier\ntrained on a clean-labeled dataset. The study demonstrates the importance of\nrefining noisy-labeled data in MSS model training and shows that utilizing the\nrefined dataset leads to comparable results derived from a clean-labeled\ndataset. Notably, upon only access to a noisy dataset, MSS models trained on a\nself-refined dataset even outperform those trained on a dataset refined with a\nclassifier trained on clean labels.\n","authors":["Junghyun Koo","Yunkee Chae","Chang-Bin Jeon","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12576v1.pdf","comment":"24th International Society for Music Information Retrieval Conference\n (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2306.16264v2","updated":"2023-07-24T07:30:53Z","published":"2023-06-28T14:46:55Z","title":"Deep Unfolded Simulated Bifurcation for Massive MIMO Signal Detection","summary":" Multiple-input multiple-output (MIMO) is a key ingredient of next-generation\nwireless communications. Recently, various MIMO signal detectors based on deep\nlearning techniques and quantum(-inspired) algorithms have been proposed to\nimprove the detection performance compared with conventional detectors. This\npaper focuses on the simulated bifurcation (SB) algorithm, a quantum-inspired\nalgorithm. This paper proposes two techniques to improve its detection\nperformance. The first is modifying the algorithm inspired by the\nLevenberg-Marquardt algorithm to eliminate local minima of maximum likelihood\ndetection. The second is the use of deep unfolding, a deep learning technique\nto train the internal parameters of an iterative algorithm. We propose a\ndeep-unfolded SB by making the update rule of SB differentiable. The numerical\nresults show that these proposed detectors significantly improve the signal\ndetection performance in massive MIMO systems.\n","authors":["Satoshi Takabe"],"pdf_url":"https://arxiv.org/pdf/2306.16264v2.pdf","comment":"5pages, 4 figures; codes are available at\n https://github.com/s-takabe/unfolded_simbif"},{"id":"http://arxiv.org/abs/2307.12564v1","updated":"2023-07-24T07:17:33Z","published":"2023-07-24T07:17:33Z","title":"Towards Generalising Neural Topical Representations","summary":" Topic models have evolved from conventional Bayesian probabilistic models to\nNeural Topic Models (NTMs) over the last two decades. 
Although NTMs have\nachieved promising performance when trained and tested on a specific corpus,\ntheir generalisation ability across corpora is rarely studied. In practice, we\noften expect that an NTM trained on a source corpus can still produce quality\ntopical representation for documents in a different target corpus without\nretraining. In this work, we aim to improve NTMs further so that their benefits\ngeneralise reliably across corpora and tasks. To do so, we propose to model\nsimilar documents by minimising their semantical distance when training NTMs.\nSpecifically, similar documents are created by data augmentation during\ntraining; The semantical distance between documents is measured by the\nHierarchical Topic Transport Distance (HOTT), which computes the Optimal\nTransport (OT) distance between the topical representations. Our framework can\nbe readily applied to most NTMs as a plug-and-play module. Extensive\nexperiments show that our framework significantly improves the generalisation\nability regarding neural topical representation across corpora.\n","authors":["Xiaohao Yang","He Zhao","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2307.12564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09251v2","updated":"2023-07-24T07:08:59Z","published":"2022-11-16T22:50:40Z","title":"Learning-Augmented B-Trees","summary":" We study learning-augmented binary search trees (BSTs) and B-Trees via Treaps\nwith composite priorities. The result is a simple search tree where the depth\nof each item is determined by its predicted weight $w_x$. To achieve the\nresult, each item $x$ has its composite priority\n$-\\lfloor\\log\\log(1/w_x)\\rfloor + U(0, 1)$ where $U(0, 1)$ is the uniform\nrandom variable. This generalizes the recent learning-augmented BSTs\n[Lin-Luo-Woodruff ICML`22], which only work for Zipfian distributions, to\narbitrary inputs and predictions. It also gives the first B-Tree data structure\nthat can provably take advantage of localities in the access sequence via\nonline self-reorganization. The data structure is robust to prediction errors\nand handles insertions, deletions, as well as prediction updates.\n","authors":["Xinyuan Cao","Jingbang Chen","Li Chen","Chris Lambert","Richard Peng","Daniel Sleator"],"pdf_url":"https://arxiv.org/pdf/2211.09251v2.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2307.10617v3","updated":"2023-07-24T07:03:01Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. 
To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.12555v1","updated":"2023-07-24T06:41:59Z","published":"2023-07-24T06:41:59Z","title":"Homophily-Driven Sanitation View for Robust Graph Contrastive Learning","summary":" We investigate adversarial robustness of unsupervised Graph Contrastive\nLearning (GCL) against structural attacks. First, we provide a comprehensive\nempirical and theoretical analysis of existing attacks, revealing how and why\nthey downgrade the performance of GCL. Inspired by our analytic results, we\npresent a robust GCL framework that integrates a homophily-driven sanitation\nview, which can be learned jointly with contrastive learning. A key challenge\nthis poses, however, is the non-differentiable nature of the sanitation\nobjective. To address this challenge, we propose a series of techniques to\nenable gradient-based end-to-end robust GCL. Moreover, we develop a fully\nunsupervised hyperparameter tuning method which, unlike prior approaches, does\nnot require knowledge of node labels. We conduct extensive experiments to\nevaluate the performance of our proposed model, GCHS (Graph Contrastive\nLearning with Homophily-driven Sanitation View), against two state of the art\nstructural attacks on GCL. Our results demonstrate that GCHS consistently\noutperforms all state of the art baselines in terms of the quality of generated\nnode embeddings as well as performance on two important downstream tasks.\n","authors":["Yulin Zhu","Xing Ai","Yevgeniy Vorobeychik","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12551v1","updated":"2023-07-24T06:38:10Z","published":"2023-07-24T06:38:10Z","title":"Continuation Path Learning for Homotopy Optimization","summary":" Homotopy optimization is a traditional method to deal with a complicated\noptimization problem by solving a sequence of easy-to-hard surrogate\nsubproblems. However, this method can be very sensitive to the continuation\nschedule design and might lead to a suboptimal solution to the original\nproblem. In addition, the intermediate solutions, often ignored by classic\nhomotopy optimization, could be useful for many real-world applications. In\nthis work, we propose a novel model-based approach to learn the whole\ncontinuation path for homotopy optimization, which contains infinite\nintermediate solutions for any surrogate subproblems. 
Rather than the classic\nunidirectional easy-to-hard optimization, our method can simultaneously\noptimize the original problem and all surrogate subproblems in a collaborative\nmanner. The proposed model also supports real-time generation of any\nintermediate solution, which could be desirable for many applications.\nExperimental studies on different problems show that our proposed method can\nsignificantly improve the performance of homotopy optimization and provide\nextra helpful information to support better decision-making.\n","authors":["Xi Lin","Zhiyuan Yang","Xiaoyuan Zhang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12551v1.pdf","comment":"Accepted by the 40th International Conference on Machine Learning\n (ICML 2023)"},{"id":"http://arxiv.org/abs/2304.12438v2","updated":"2023-07-24T06:19:17Z","published":"2023-04-24T20:24:07Z","title":"Stochastic MPC for energy hubs using data driven demand forecasting","summary":" Energy hubs convert and distribute energy resources by combining different\nenergy inputs through multiple conversion and storage components. The optimal\noperation of the energy hub exploits its flexibility to increase the energy\nefficiency and reduce the operational costs. However, uncertainties in the\ndemand present challenges to energy hub optimization. In this paper, we propose\na stochastic MPC controller to minimize energy costs using chance constraints\nfor the uncertain electricity and thermal demands. Historical data is used to\nbuild a demand prediction model based on Gaussian processes to generate a\nforecast of the future electricity and heat demands. The stochastic\noptimization problem is solved via the Scenario Approach by sampling multi-step\ndemand trajectories from the derived prediction model. The performance of the\nproposed predictor and of the stochastic controller is verified on a simulated\nenergy hub model and demand data from a real building.\n","authors":["Varsha Behrunani","Francesco Micheli","Jonas Mehr","Philipp Heer","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2304.12438v2.pdf","comment":"6 pages, 5 figures. Submitted to IFAC World Congress 2023"},{"id":"http://arxiv.org/abs/2211.09710v3","updated":"2023-07-24T05:39:27Z","published":"2022-11-17T17:45:59Z","title":"Style Classification of Rabbinic Literature for Detection of Lost\n Midrash Tanhuma Material","summary":" Midrash collections are complex rabbinic works that consist of text in\nmultiple languages, which evolved through long processes of unstable oral and\nwritten transmission. Determining the origin of a given passage in such a\ncompilation is not always straightforward and is often a matter of dispute\namong scholars, yet it is essential for scholars' understanding of the passage\nand its relationship to other texts in the rabbinic corpus. To help solve this\nproblem, we propose a system for classification of rabbinic literature based on\nits style, leveraging recent advances in natural language processing for Hebrew\ntexts. 
Additionally, we demonstrate how this method can be applied to uncover\nlost material from a specific midrash genre, Tan\\d{h}uma-Yelammedenu, that has\nbeen preserved in later anthologies.\n","authors":["Shlomo Tannor","Nachum Dershowitz","Moshe Lavee"],"pdf_url":"https://arxiv.org/pdf/2211.09710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12532v1","updated":"2023-07-24T05:36:19Z","published":"2023-07-24T05:36:19Z","title":"On the Connection between Pre-training Data Diversity and Fine-tuning\n Robustness","summary":" Pre-training has been widely adopted in deep learning to improve model\nperformance, especially when the training data for a target task is limited. In\nour work, we seek to understand the implications of this training strategy on\nthe generalization properties of downstream models. More specifically, we ask\nthe following question: how do properties of the pre-training distribution\naffect the robustness of a fine-tuned model? The properties we explore include\nthe label space, label semantics, image diversity, data domains, and data\nquantity of the pre-training distribution. We find that the primary factor\ninfluencing downstream effective robustness (Taori et al., 2020) is data\nquantity, while other factors have limited significance. For example, reducing\nthe number of ImageNet pre-training classes by 4x while increasing the number\nof images per class by 4x (that is, keeping total data quantity fixed) does not\nimpact the robustness of fine-tuned models. We demonstrate our findings on\npre-training distributions drawn from various natural and synthetic data\nsources, primarily using the iWildCam-WILDS distribution shift as a test for\ndownstream robustness.\n","authors":["Vivek Ramanujan","Thao Nguyen","Sewoong Oh","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2307.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12526v1","updated":"2023-07-24T04:56:23Z","published":"2023-07-24T04:56:23Z","title":"Rethinking Medical Report Generation: Disease Revealing Enhancement with\n Knowledge Graph","summary":" Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG)\nbecause it reveals the relations among diseases and thus can be utilized to\nguide the generation process. However, constructing a comprehensive KG is\nlabor-intensive and its applications on the MRG process are under-explored. In\nthis study, we establish a complete KG on chest X-ray imaging that includes 137\ntypes of diseases and abnormalities. Based on this KG, we find that the current\nMRG data sets exhibit a long-tailed problem in disease distribution. To\nmitigate this problem, we introduce a novel augmentation strategy that enhances\nthe representation of disease types in the tail-end of the distribution. We\nfurther design a two-stage MRG approach, where a classifier is first trained to\ndetect whether the input images exhibit any abnormalities. The classified\nimages are then independently fed into two transformer-based generators,\nnamely, ``disease-specific generator\" and ``disease-free generator\" to generate\nthe corresponding reports. To enhance the clinical evaluation of whether the\ngenerated reports correctly describe the diseases appearing in the input image,\nwe propose diverse sensitivity (DS), a new metric that checks whether generated\ndiseases match ground truth and measures the diversity of all generated\ndiseases. 
Results show that the proposed two-stage generation framework and\naugmentation strategies improve DS by a considerable margin, indicating a\nnotable reduction in the long-tailed problem associated with under-represented\ndiseases.\n","authors":["Yixin Wang","Zihao Lin","Haoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2307.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12524v1","updated":"2023-07-24T04:46:22Z","published":"2023-07-24T04:46:22Z","title":"Landslide Surface Displacement Prediction Based on VSXC-LSTM Algorithm","summary":" Landslide is a natural disaster that can easily threaten local ecology,\npeople's lives and property. In this paper, we conduct modelling research on\nreal unidirectional surface displacement data of recent landslides in the\nresearch area and propose a time series prediction framework named\nVMD-SegSigmoid-XGBoost-ClusterLSTM (VSXC-LSTM) based on variational mode\ndecomposition, which can predict the landslide surface displacement more\naccurately. The model performs well on the test set. Except for the random item\nsubsequence that is hard to fit, the root mean square error (RMSE) and the mean\nabsolute percentage error (MAPE) of the trend item subsequence and the periodic\nitem subsequence are both less than 0.1, and the RMSE is as low as 0.006 for\nthe periodic item prediction module based on XGBoost\\footnote{Accepted in\nICANN2023}.\n","authors":["Menglin Kong","Ruichen Li","Fan Liu","Xingquan Li","Juan Cheng","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12520v1","updated":"2023-07-24T04:29:43Z","published":"2023-07-24T04:29:43Z","title":"Lost In Translation: Generating Adversarial Examples Robust to\n Round-Trip Translation","summary":" Language Models today provide a high accuracy across a large number of\ndownstream tasks. However, they remain susceptible to adversarial attacks,\nparticularly against those where the adversarial examples maintain considerable\nsimilarity to the original text. Given the multilingual nature of text, the\neffectiveness of adversarial examples across translations and how machine\ntranslations can improve the robustness of adversarial examples remain largely\nunexplored. In this paper, we present a comprehensive study on the robustness\nof current text adversarial attacks to round-trip translation. We demonstrate\nthat 6 state-of-the-art text-based adversarial attacks do not maintain their\nefficacy after round-trip translation. Furthermore, we introduce an\nintervention-based solution to this problem, by integrating Machine Translation\ninto the process of adversarial example generation and demonstrating increased\nrobustness to round-trip translation. 
Our results indicate that finding\nadversarial examples robust to translation can help identify the insufficiency\nof language models that is common across languages, and motivate further\nresearch into multilingual adversarial attacks.\n","authors":["Neel Bhandari","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12520v1.pdf","comment":"Published at International Conference on Acoustics, Speech, and\n Signal Processing (ICASSP) 2023"},{"id":"http://arxiv.org/abs/2307.12519v1","updated":"2023-07-24T04:29:00Z","published":"2023-07-24T04:29:00Z","title":"DEPHN: Different Expression Parallel Heterogeneous Network using virtual\n gradient optimization for Multi-task Learning","summary":" Recommendation system algorithm based on multi-task learning (MTL) is the\nmajor method for Internet operators to understand users and predict their\nbehaviors in the multi-behavior scenario of platform. Task correlation is an\nimportant consideration of MTL goals, traditional models use shared-bottom\nmodels and gating experts to realize shared representation learning and\ninformation differentiation. However, The relationship between real-world tasks\nis often more complex than existing methods do not handle properly sharing\ninformation. In this paper, we propose an Different Expression Parallel\nHeterogeneous Network (DEPHN) to model multiple tasks simultaneously. DEPHN\nconstructs the experts at the bottom of the model by using different feature\ninteraction methods to improve the generalization ability of the shared\ninformation flow. In view of the model's differentiating ability for different\ntask information flows, DEPHN uses feature explicit mapping and virtual\ngradient coefficient for expert gating during the training process, and\nadaptively adjusts the learning intensity of the gated unit by considering the\ndifference of gating values and task correlation. Extensive experiments on\nartificial and real-world datasets demonstrate that our proposed method can\ncapture task correlation in complex situations and achieve better performance\nthan baseline models\\footnote{Accepted in IJCNN2023}.\n","authors":["Menglin Kong","Ri Su","Shaojie Zhao","Muzhou Hou"],"pdf_url":"https://arxiv.org/pdf/2307.12519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12518v1","updated":"2023-07-24T04:23:08Z","published":"2023-07-24T04:23:08Z","title":"FaFCNN: A General Disease Classification Framework Based on Feature\n Fusion Neural Networks","summary":" There are two fundamental problems in applying deep learning/machine learning\nmethods to disease classification tasks, one is the insufficient number and\npoor quality of training samples; another one is how to effectively fuse\nmultiple source features and thus train robust classification models. To\naddress these problems, inspired by the process of human learning knowledge, we\npropose the Feature-aware Fusion Correlation Neural Network (FaFCNN), which\nintroduces a feature-aware interaction module and a feature alignment module\nbased on domain adversarial learning. This is a general framework for disease\nclassification, and FaFCNN improves the way existing methods obtain sample\ncorrelation features. The experimental results show that training using\naugmented features obtained by pre-training gradient boosting decision tree\nyields more performance gains than random-forest based methods. 
On the\nlow-quality dataset with a large amount of missing data in our setup, FaFCNN\nobtains a consistently optimal performance compared to competitive baselines.\nIn addition, extensive experiments demonstrate the robustness of the proposed\nmethod and the effectiveness of each component of the model\\footnote{Accepted\nin IEEE SMC2023}.\n","authors":["Menglin Kong","Shaojie Zhao","Juan Cheng","Xingquan Li","Ri Su","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12510v1","updated":"2023-07-24T03:52:11Z","published":"2023-07-24T03:52:11Z","title":"An Empirical Evaluation of Temporal Graph Benchmark","summary":" In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark\n(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with\nTGB, we include eleven popular dynamic graph learning methods for more\nexhaustive comparisons. Through the experiments, we find that (1) some issues\nneed to be addressed in the current version of TGB, including mismatched data\nstatistics, inaccurate evaluation metric computation, and so on; (2) different\nmodels depict varying performance across various datasets, which is in line\nwith previous observations; (3) the performance of some baselines can be\nsignificantly improved over the reported results in TGB when using DyGLib. This\nwork aims to ease the researchers' efforts in evaluating various dynamic graph\nlearning methods on TGB and attempts to offer results that can be directly\nreferenced in the follow-up research. All the used resources in this project\nare publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is\nin progress, and feedback from the community is welcomed for improvements.\n","authors":["Le Yu"],"pdf_url":"https://arxiv.org/pdf/2307.12510v1.pdf","comment":"preprint, in progress"},{"id":"http://arxiv.org/abs/2304.03483v2","updated":"2023-07-24T03:28:34Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. The first, are\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. The second is the recent\nRegularization by Denoising (RED), which provides a flexible framework to\nexploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. 
Although the main focus is on dynamic tomography, we also show\nthe performance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12499v1","updated":"2023-07-24T03:10:02Z","published":"2023-07-24T03:10:02Z","title":"AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion\n Models","summary":" Unrestricted adversarial attacks present a serious threat to deep learning\nmodels and adversarial defense techniques. They pose severe security problems\nfor deep learning applications because they can effectively bypass defense\nmechanisms. However, previous attack methods often utilize Generative\nAdversarial Networks (GANs), which are not theoretically provable and thus\ngenerate unrealistic examples by incorporating adversarial objectives,\nespecially for large-scale datasets like ImageNet. In this paper, we propose a\nnew method, called AdvDiff, to generate unrestricted adversarial examples with\ndiffusion models. We design two novel adversarial guidance techniques to\nconduct adversarial sampling in the reverse generation process of diffusion\nmodels. These two techniques are effective and stable to generate high-quality,\nrealistic adversarial examples by integrating gradients of the target\nclassifier interpretably. Experimental results on MNIST and ImageNet datasets\ndemonstrate that AdvDiff is effective to generate unrestricted adversarial\nexamples, which outperforms GAN-based methods in terms of attack performance\nand generation quality.\n","authors":["Xuelong Dai","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.12499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12496v1","updated":"2023-07-24T03:04:10Z","published":"2023-07-24T03:04:10Z","title":"A faster and simpler algorithm for learning shallow networks","summary":" We revisit the well-studied problem of learning a linear combination of $k$\nReLU activations given labeled examples drawn from the standard $d$-dimensional\nGaussian measure. Chen et al. [CDG+23] recently gave the first algorithm for\nthis problem to run in $\\text{poly}(d,1/\\varepsilon)$ time when $k = O(1)$,\nwhere $\\varepsilon$ is the target error. More precisely, their algorithm runs\nin time $(d/\\varepsilon)^{\\mathrm{quasipoly}(k)}$ and learns over multiple\nstages. Here we show that a much simpler one-stage version of their algorithm\nsuffices, and moreover its runtime is only $(d/\\varepsilon)^{O(k^2)}$.\n","authors":["Sitan Chen","Shyam Narayanan"],"pdf_url":"https://arxiv.org/pdf/2307.12496v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2307.12491v1","updated":"2023-07-24T02:50:19Z","published":"2023-07-24T02:50:19Z","title":"Learning Universal and Robust 3D Molecular Representations with Graph\n Convolutional Networks","summary":" To learn accurate representations of molecules, it is essential to consider\nboth chemical and geometric features. To encode geometric information, many\ndescriptors have been proposed in constrained circumstances for specific types\nof molecules and do not have the properties to be ``robust\": 1. Invariant to\nrotations and translations; 2. Injective when embedding molecular structures.\nIn this work, we propose a universal and robust Directional Node Pair (DNP)\ndescriptor based on the graph representations of 3D molecules. 
Our DNP\ndescriptor is more robust than previous ones and can be applied to multiple\nmolecular types. To combine the DNP descriptor and chemical features in\nmolecules, we construct the Robust Molecular Graph Convolutional Network\n(RoM-GCN), which is capable of taking both node and edge features into\nconsideration when generating molecule representations. We evaluate our model\non protein and small molecule datasets. Our results validate the superiority of\nthe DNP descriptor in incorporating 3D geometric information of molecules.\nRoM-GCN outperforms all compared baselines.\n","authors":["Shuo Zhang","Yang Liu","Li Xie","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2307.12491v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2307.01482v2","updated":"2023-07-24T02:40:29Z","published":"2023-07-04T05:19:19Z","title":"Nexus sine qua non: Essentially Connected Networks for Traffic\n Forecasting","summary":" Spatial-temporal graph neural networks (STGNNs) have become the de facto\nmodels for learning spatiotemporal representations of traffic flow. However,\nmodern STGNNs often contain superfluous or obscure components, along with\ncomplex techniques, posing significant challenges in terms of complexity and\nscalability. Such concerns prompt us to rethink the design of neural\narchitectures and to identify the key challenges in traffic forecasting as\nspatial-temporal contextualization. Here, we present an essentially connected\nmodel based on an efficient message-passing backbone, powered by learnable node\nembeddings, without any complex sequential techniques such as TCNs, RNNs, and\nTransformers. Intriguingly, empirical results demonstrate how a simple and\nelegant model with contextualization capability compares favorably w.r.t. the\nstate-of-the-art with elaborate structures, while being much more interpretable\nand computationally efficient for traffic forecasting. We anticipate that our\nfindings will open new horizons for further research to explore the possibility\nof creating simple but effective neural forecasting architectures.\n","authors":["Tong Nie","Guoyang Qin","Yunpeng Wang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2307.01482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04893v2","updated":"2023-07-24T02:38:09Z","published":"2023-07-10T20:31:23Z","title":"Choosing Well Your Opponents: How to Guide the Synthesis of Programmatic\n Strategies","summary":" This paper introduces Local Learner (2L), an algorithm for providing a set of\nreference strategies to guide the search for programmatic strategies in\ntwo-player zero-sum games. Previous learning algorithms, such as Iterated Best\nResponse (IBR), Fictitious Play (FP), and Double-Oracle (DO), can be\ncomputationally expensive or miss important information for guiding search\nalgorithms. 2L actively selects a set of reference strategies to improve the\nsearch signal. We empirically demonstrate the advantages of our approach while\nguiding a local search algorithm for synthesizing strategies in three games,\nincluding MicroRTS, a challenging real-time strategy game. Results show that 2L\nlearns reference strategies that provide a stronger search signal than IBR, FP,\nand DO. We also simulate a tournament of MicroRTS, where a synthesizer using 2L\noutperformed the winners of the two latest MicroRTS competitions, which were\nprogrammatic strategies written by human programmers.\n","authors":["Rubens O. Moraes","David S. Aleixo","Lucas N. Ferreira","Levi H. S. 
Lelis"],"pdf_url":"https://arxiv.org/pdf/2307.04893v2.pdf","comment":"International Joint Conference on Artificial Intelligence (IJCAI)\n 2023"},{"id":"http://arxiv.org/abs/2307.12480v1","updated":"2023-07-24T02:28:50Z","published":"2023-07-24T02:28:50Z","title":"Learning Resource Allocation Policy: Vertex-GNN or Edge-GNN?","summary":" Graph neural networks (GNNs) update the hidden representations of vertices\n(called Vertex-GNNs) or hidden representations of edges (called Edge-GNNs) by\nprocessing and pooling the information of neighboring vertices and edges and\ncombining it to incorporate graph topology. When learning resource allocation\npolicies, GNNs cannot perform well if their expressive power is weak, i.e., if\nthey cannot differentiate all input features such as channel matrices. In this\npaper, we analyze the expressive power of the Vertex-GNNs and Edge-GNNs for\nlearning three representative wireless policies: link scheduling, power\ncontrol, and precoding policies. We find that the expressive power of the GNNs\ndepends on the linearity and output dimensions of the processing and combination\nfunctions. When linear processors are used, the Vertex-GNNs cannot\ndifferentiate all channel matrices due to the loss of channel information,\nwhile the Edge-GNNs can. When learning the precoding policy, even the\nVertex-GNNs with non-linear processors may not have strong expressive\nability due to the dimension compression. We proceed to provide necessary\nconditions for the GNNs to learn the precoding policy well. Simulation results\nvalidate the analyses and show that the Edge-GNNs can achieve the same\nperformance as the Vertex-GNNs with much lower training and inference time.\n","authors":["Yao Peng","Jia Guo","Chenyang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16392v2","updated":"2023-07-24T02:05:50Z","published":"2022-10-28T20:13:00Z","title":"Physics-aware Graph Neural Network for Accurate RNA 3D Structure\n Prediction","summary":" Biological functions of RNAs are determined by their three-dimensional (3D)\nstructures. Thus, given the limited number of experimentally determined RNA\nstructures, the prediction of RNA structures will facilitate elucidating RNA\nfunctions and RNA-targeted drug discovery, but remains a challenging task. In\nthis work, we propose a Graph Neural Network (GNN)-based scoring function\ntrained only with the atomic types and coordinates of limited solved RNA 3D\nstructures for distinguishing accurate structural models. The proposed\nPhysics-aware Multiplex Graph Neural Network (PaxNet) separately models the\nlocal and non-local interactions inspired by molecular mechanics. Furthermore,\nPaxNet contains an attention-based fusion module that learns the individual\ncontribution of each interaction type for the final prediction. We rigorously\nevaluate the performance of PaxNet on two benchmarks and compare it with\nseveral state-of-the-art baselines. The results show that PaxNet significantly\noutperforms all the baselines overall, and demonstrate the potential of PaxNet\nfor improving the 3D structure modeling of RNA and other macromolecules. 
Our\ncode is available at https://github.com/zetayue/Physics-aware-Multiplex-GNN.\n","authors":["Shuo Zhang","Yang Liu","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2210.16392v2.pdf","comment":"Accepted by the Machine Learning for Structural Biology Workshop\n (MLSB) at the 36th Conference on Neural Information Processing Systems\n (NeurIPS 2022)"},{"id":"http://arxiv.org/abs/2307.12472v1","updated":"2023-07-24T01:58:48Z","published":"2023-07-24T01:58:48Z","title":"Model-free generalized fiducial inference","summary":" Motivated by the need for the development of safe and reliable methods for\nuncertainty quantification in machine learning, I propose and develop ideas for\na model-free statistical framework for imprecise probabilistic prediction\ninference. This framework facilitates uncertainty quantification in the form of\nprediction sets that offer finite sample control of type 1 errors, a property\nshared with conformal prediction sets, but this new approach also offers more\nversatile tools for imprecise probabilistic reasoning. Furthermore, I propose\nand consider the theoretical and empirical properties of a precise\nprobabilistic approximation to the model-free imprecise framework.\nApproximating a belief/plausibility measure pair by an [optimal in some sense]\nprobability measure in the credal set is a critical resolution needed for the\nbroader adoption of imprecise probabilistic approaches to inference in\nstatistical and machine learning communities. It is largely undetermined in the\nstatistical and machine learning literatures, more generally, how to properly\nquantify uncertainty in that there is no generally accepted standard of\naccountability of stated uncertainties. The research I present in this\nmanuscript is aimed at motivating a framework for statistical inference with\nreliability and accountability as the guiding principles.\n","authors":["Jonathan P Williams"],"pdf_url":"https://arxiv.org/pdf/2307.12472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12463v1","updated":"2023-07-24T00:53:46Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident output\nand require correction by calibration methods. Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks. 
To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT) which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12461v1","updated":"2023-07-24T00:16:50Z","published":"2023-07-24T00:16:50Z","title":"Rates of Approximation by ReLU Shallow Neural Networks","summary":" Neural networks activated by the rectified linear unit (ReLU) play a central\nrole in the recent development of deep learning. The topic of approximating\nfunctions from H\\\"older spaces by these networks is crucial for understanding\nthe efficiency of the induced learning algorithms. Although the topic has been\nwell investigated in the setting of deep neural networks with many layers of\nhidden neurons, it is still open for shallow networks having only one hidden\nlayer. In this paper, we provide rates of uniform approximation by these\nnetworks. We show that ReLU shallow neural networks with $m$ hidden neurons can\nuniformly approximate functions from the H\\\"older space $W_\\infty^r([-1, 1]^d)$\nwith rates $O((\\log m)^{\\frac{1}{2} +d}m^{-\\frac{r}{d}\\frac{d+2}{d+4}})$ when\n$r